import networkx as nx
import numpy as np
import scipy
import sklearn.metrics
from typing import Union
[docs]
def bootstrap_statistic(
y_true: np.ndarray,
y_pred: np.ndarray,
dy_true: Union[np.ndarray, None] = None,
dy_pred: Union[np.ndarray, None] = None,
ci: float = 0.95,
statistic: str = "RMSE",
nbootstrap: int = 1000,
plot_type: str = "dG",
include_true_uncertainty: bool = False,
include_pred_uncertainty: bool = False,
) -> dict:
"""Compute mean and confidence intervals of specified statistic.
Parameters
----------
y_true : ndarray with shape (N,)
True values
y_pred : ndarray with shape (N,)
Predicted values
dy_true : ndarray with shape (N,) or None
Errors of true values. If None, the values are assumed to have no errors
dy_pred : ndarray with shape (N,) or None
Errors of predicted values. If None, the values are assumed to have no errors
ci : float, optional, default=0.95
Interval for confidence interval (CI)
statistic : str
Statistic, one of ['RMSE', 'MUE', 'R2', 'rho','KTAU','RAE']
nbootstrap : int, optional, default=1000
Number of bootstrap samples
plot_type : str, optional, default='dG'
'dG' or 'ddG'
include_true_uncertainty : bool, default False
whether to account for the uncertainty in y_true when bootstrapping
include_pred_uncertainty : bool, default False
whether to account for the uncertainty in y_pred when bootstrapping
Returns
-------
rmse_stats : dict of float
'mean' : mean RMSE
'stderr' : standard error
'low' : low end of CI
'high' : high end of CI
"""
def compute_statistic(y_true_sample: np.ndarray, y_pred_sample: np.ndarray, statistic: str):
"""Compute requested statistic.
Parameters
----------
y_true : ndarray with shape (N,)
True values
y_pred : ndarray with shape (N,)
Predicted values
statistic : str
Statistic, one of ['RMSE', 'MUE', 'R2', 'rho','RAE','KTAU']
"""
def calc_RAE(y_true_sample: np.ndarray, y_pred_sample: np.ndarray):
MAE = sklearn.metrics.mean_absolute_error(y_true_sample, y_pred_sample)
mean = np.mean(y_true_sample)
MAD = np.sum([np.abs(mean - i) for i in y_true_sample]) / float(len(y_true_sample))
return MAE / MAD
def calc_RRMSE(y_true_sample: np.ndarray, y_pred_sample: np.ndarray):
rmse = np.sqrt(sklearn.metrics.mean_squared_error(y_true_sample, y_pred_sample))
mean_exp = np.mean(y_true_sample)
mds = np.sum([(mean_exp - i) ** 2 for i in y_true_sample]) / float(len(y_true_sample))
rrmse = np.sqrt(rmse**2 / mds)
return rrmse
if statistic == "RMSE":
return np.sqrt(sklearn.metrics.mean_squared_error(y_true_sample, y_pred_sample))
elif statistic == "MUE":
return sklearn.metrics.mean_absolute_error(y_true_sample, y_pred_sample)
elif statistic == "R2":
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
y_true_sample, y_pred_sample
)
return r_value**2
elif statistic == "rho":
return scipy.stats.pearsonr(y_true_sample, y_pred_sample)[0]
elif statistic == "RAE":
return calc_RAE(y_true_sample, y_pred_sample)
elif statistic == "KTAU":
return scipy.stats.kendalltau(y_true_sample, y_pred_sample)[0]
else:
raise Exception("unknown statistic '{}'".format(statistic))
# not used?
def unique_differences(x):
"""Compute all unique differences"""
N = len(x)
return np.array([(x[i] - x[j]) for i in range(N) for j in range(N) if (i != j)])
if dy_true is None:
dy_true = np.zeros_like(y_true)
if dy_pred is None:
dy_pred = np.zeros_like(y_pred)
assert len(y_true) == len(y_pred)
assert len(y_true) == len(dy_true)
assert len(y_true) == len(dy_pred)
sample_size = len(y_true)
s_n = np.zeros(
[nbootstrap], np.float64
) # s_n[n] is the statistic computed for bootstrap sample n
for replicate in range(nbootstrap):
y_true_sample = np.zeros_like(y_true)
y_pred_sample = np.zeros_like(y_pred)
for i, j in enumerate(
np.random.choice(np.arange(sample_size), size=[sample_size], replace=True)
):
stddev_true = np.fabs(dy_true[j]) if include_true_uncertainty else 0
stddev_pred = np.fabs(dy_pred[j]) if include_pred_uncertainty else 0
y_true_sample[i] = np.random.normal(loc=y_true[j], scale=stddev_true, size=1)
y_pred_sample[i] = np.random.normal(loc=y_pred[j], scale=stddev_pred, size=1)
s_n[replicate] = compute_statistic(y_true_sample, y_pred_sample, statistic)
rmse_stats = dict()
rmse_stats["mle"] = compute_statistic(y_true, y_pred, statistic)
rmse_stats["stderr"] = np.std(s_n)
rmse_stats["mean"] = np.mean(s_n)
# TODO: Is there a canned method to do this?
s_n = np.sort(s_n)
low_frac = (1.0 - ci) / 2.0
high_frac = 1.0 - low_frac
rmse_stats["low"] = s_n[int(np.floor(nbootstrap * low_frac))]
rmse_stats["high"] = s_n[int(np.ceil(nbootstrap * high_frac))]
return rmse_stats
[docs]
def mle(
graph: nx.DiGraph, factor: str = "f_ij", node_factor: Union[str, None] = None
) -> np.ndarray:
"""
Compute maximum likelihood estimate of free energies and covariance in their estimates.
The number 'factor' is the node attribute on which the MLE will be calculated,
where d'factor' will be used as the standard error of the factor
We assume the free energy of node 0 is zero.
Reference : https://pubs.acs.org/doi/abs/10.1021/acs.jcim.9b00528
Xu, Huafengraph. "Optimal measurement network of pairwise differences."
Journal of Chemical Information and Modeling 59.11 (2019): 4720-4728.
NOTE: Self-edges (edges that connect a node to itself) will be ignored.
Parameters
----------
graph :nx.Graph
The graph for which an estimate is to be computed
Each edge must have attributes 'f_ij' and 'df_ij' for the free energy and uncertainty
estimate
Will have 'bayesian_f_ij' and 'bayesian_df_ij' added to each edge
and 'bayesian_f_i' and 'bayesian_df_i' added to each node.
factor : string, default = 'f_ij'
node attribute of nx.Graph that will be used for MLE
node_factor : string, default = None
optional - provide if there is node data (i.e. absolute values) 'f_i' or 'exp_DG' to
include will expect a corresponding uncertainty 'f_di' or 'exp_dDG'
Returns
-------
f_i : np.array with shape (n_ligands,)
f_i[i] is the absolute free energy of ligand i in kT
f_i[0] = 0
C : np.array with shape (n_ligands, n_ligands)
C[i,j] is the covariance of the free energy estimates of i and j
"""
N = graph.number_of_nodes()
if node_factor is None:
f_ij = form_edge_matrix(graph, factor, action="antisymmetrize")
df_ij = form_edge_matrix(graph, factor.replace("_", "_d"), action="symmetrize")
else:
f_ij = form_edge_matrix(graph, factor, action="antisymmetrize", node_label=node_factor)
df_ij = form_edge_matrix(
graph,
factor.replace("_", "_d"),
action="symmetrize",
node_label=node_factor.replace("_", "_d"),
)
node_name_to_index = {}
for i, name in enumerate(graph.nodes()):
node_name_to_index[name] = i
# Form F matrix (Eq 4)
F_matrix = np.zeros([N, N])
for (a, b) in graph.edges:
i = node_name_to_index[a]
j = node_name_to_index[b]
if i == j:
# The MLE solver will fail if we include self-edges, so we need to omit these
continue
F_matrix[i, j] = -df_ij[i, j] ** (-2)
F_matrix[j, i] = -df_ij[i, j] ** (-2)
for n in graph.nodes:
i = node_name_to_index[n]
if df_ij[i, i] == 0.0:
F_matrix[i, i] = -np.sum(F_matrix[i, :])
else:
F_matrix[i, i] = df_ij[i, i] ** (-2) - np.sum(F_matrix[i, :])
# Form z vector (Eq 3)
z = np.zeros([N])
for n in graph.nodes:
i = node_name_to_index[n]
if df_ij[i, i] != 0.0:
z[i] = f_ij[i, i] * df_ij[i, i] ** (-2)
for (a, b) in graph.edges:
i = node_name_to_index[a]
j = node_name_to_index[b]
if i == j:
# The MLE solver will fail if we include self-edges, so we need to omit these
continue
z[i] += f_ij[i, j] * df_ij[i, j] ** (-2)
z[j] += f_ij[j, i] * df_ij[j, i] ** (-2)
# Compute MLE estimate (Eq 2)
Finv = np.linalg.pinv(F_matrix)
f_i = np.matmul(Finv, z)
# Compute uncertainty
C = Finv
return f_i, C