Source code for cinnabar.plotting

import itertools
from typing import Any, Literal

import matplotlib.pylab as plt
import networkx as nx
import numpy as np
import seaborn as sns
from adjustText import adjust_text
from openff.units import Quantity, unit

from cinnabar import conversion, plotlying, stats
from cinnabar.femap import ABSOLUTE_ANALYSIS_TYPES, RELATIVE_ANALYSIS_TYPES, FEMap

# Define the default guidelines in kcal/mol for the scatter plots
_DEFAULT_GUIDELINES_DG = (0.5, 1.0)


def _convert_guidelines_to_pic50(guidelines: tuple[float, float], temperature: Quantity) -> tuple[float, ...]:
    """
    Convert the given guidelines from kcal/mol to pIC50 values at the given temperature.

    Parameters
    ----------
    guidelines : tuple[float, float]
        The guideline distances in kcal/mol to convert.
    temperature : Quantity
        The temperature at which to perform the conversion.
    """
    converted: list[float] = []
    for g in guidelines:
        dg = Quantity(g, units=unit.kilocalorie_per_mole)
        pic50, _ = conversion.convert_observable(dg, original_type="dg", final_type="pic50", temperature=temperature)
        converted.append(abs(pic50.m))  # type: ignore[arg-type]
    return tuple(converted)


def _update_plot_kwargs(
    kwargs: dict,
    plot_meta: dict[str, str],
    observable_type: str,
    temperature: Quantity,
) -> tuple[str, str, dict]:
    """Extract and resolve quantity, units, and guidelines from kwargs for pair_plot calls."""
    quantity = kwargs.pop("quantity", plot_meta["quantity"])
    units = kwargs.pop("units", plot_meta["units"])
    if kwargs.get("guidelines", True) is True and observable_type.lower().endswith("pic50"):
        kwargs["guidelines"] = _convert_guidelines_to_pic50(guidelines=_DEFAULT_GUIDELINES_DG, temperature=temperature)
    return quantity, units, kwargs


# Map the observable type to expected dataframe column names and the units for the plot
_OBSERVABLE_METADATA: dict[str, dict[str, str]] = {
    "ddg": {
        "value_col": "DDG (kcal/mol)",
        "uncertainty_col": "uncertainty (kcal/mol)",
        "quantity": r"$\Delta\Delta$G",
        "units": r"$\mathrm{kcal\,mol^{-1}}$",
        "ecdf_quantity": r"$|\Delta\Delta$G$_{calc} - \Delta\Delta$G$_{exp}|$",
    },
    "dg": {
        "value_col": "DG (kcal/mol)",
        "uncertainty_col": "uncertainty (kcal/mol)",
        "quantity": r"$\Delta$G",
        "units": r"$\mathrm{kcal\,mol^{-1}}$",
        "ecdf_quantity": r"$|\Delta$G$_{calc} - \Delta$G$_{exp}|$",
    },
    "dpic50": {
        "value_col": "DpIC50",
        "uncertainty_col": "uncertainty (unitless)",
        "quantity": r"$\Delta$pIC50",
        "units": "unitless",
        "ecdf_quantity": r"$|\Delta$pIC50$_{calc}$ - $\Delta$pIC50$_{exp}|$",
    },
    "pic50": {
        "value_col": "pIC50",
        "uncertainty_col": "uncertainty (unitless)",
        "quantity": "pIC50",
        "units": "unitless",
        "ecdf_quantity": r"$|$pIC50$_{calc}$ - pIC50$_{exp}|$",
    },
}


[docs] def pair_plot( x: np.ndarray, y: np.ndarray, title: str = "", xerr: np.ndarray | None = None, yerr: np.ndarray | None = None, method_name: str = "", target_name: str = "", quantity: str = r"$\Delta\Delta$G", xlabel: str = "Experimental", ylabel: str = "Calculated", units: str = r"$\mathrm{kcal\,mol^{-1}}$", guidelines: bool | tuple[float, float] = True, guideline_colors: tuple[str, str] | None = None, origins: bool = True, color: str | None = None, statistics: list[Literal["RMSE", "NRMSE", "MUE", "RAE", "R2", "rho", "KTAU", "PI"]] | None = None, filename: str | None = None, centralizing: bool = True, shift: float = 0.0, figsize: float = 3.25, dpi: float | str = 300, data_labels: list[str] | None = None, axis_padding: float = 0.5, xy_lim: tuple[float, float] | None = None, font_sizes: dict[str, int] | None = None, bootstrap_x_uncertainty: bool = False, bootstrap_y_uncertainty: bool = False, statistic_type: Literal["mle", "mean"] = "mle", scatter_kwargs: dict[str, float | int | str] | None = None, ): r"""Handles the aesthetics of the pair plots in one place. Parameters ---------- x : np.ndarray Values to plot on the x-axis. y : np.ndarray Values to plot on the y-axis title : str, default "" Title for the plot. xerr : np.ndarray | None , default None Error bars for x values if available. yerr : np.ndarray | None , default None Error bars for y values if available. method_name : str, default "" Name of method associated with results, e.g. "openfe". target_name : str, default "" Name of system for results, e.g. "Thrombin". quantity : str, default r"$\Delta\Delta$G" Metric that is being plotted, which will be included in the axis labels. xlabel : str, default "Experimental" Main abel for the x-axis. ylabel : str, default "Calculated" Main label for the y-axis. units : str, default r"$\mathrm{kcal\,mol^{-1}}$" String value of units to label axis with. guidelines : bool | tuple[float, float], default True Controls the shaded guideline bands drawn around the x=y line. - ``True``: draw the default two bands at 0.5 and 1.0 kcal/mol. - ``False``: no guidelines are drawn. - ``tuple`` of up to 2 floats: draw a band for each value, e.g. ``(0.5,)`` for a single band or ``(0.5, 1.0)`` for two bands. When calling the high-level plotting functions (``plot_DDGs``, ``plot_DGs``, ``plot_all_DDGs``) with ``observable_type="pic50"``, the default distances are automatically converted to pIC50 units so the bands remain physically meaningful. Pass an explicit ``guidelines`` tuple to override this. guideline_colors : tuple[str, str], default None Colors for each guideline band, corresponding positionally to each distance in *guidelines*. If ``None``, or fewer colors than bands are provided, the remaining bands default to ``"grey"``. Example: ``("blue", "red")`` colors the inner band blue and the outer band red. origins : bool, default True Toggles plotting of x and y-axis. color : str, default None The name of the colour scheme for the scatter plots. If None, will be coloured according to distance from unity statistics : list[{'RMSE', 'NRMSE', 'MUE', 'RAE', 'R2', 'rho', 'KTAU', 'PI'}] | None, default None List of statistics to calculate and report on the plot, if None "RMSE" and "MUE" will be reported. filename : str | None, default None Filename for plot, if not provided the plot will be displayed. centralizing : bool, default True Offset the free energies ``True`` or report raw values ``False``. shift : float, default 0. Shift both the x and y-axis by a constant. figsize : float, default 3.25 Size of figure for matplotlib. dpi : float | str, default 300 The resolution in dots per inch if 'figure', uses the figure's dpi value (this behavior is copied from https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.savefig.html) data_labels : list[str] | None, default None List of labels for each data point. axis_padding : float, default 0.5 Padding to add to maximum axis value and subtract from the minimum axis value. xy_lim : tuple[float, float] | None, default None Contains the minimum and maximum values to use for the x and y-axis. if specified, ``axis_padding`` is ignored. font_sizes : dict[str, int] | None, default None Font sizes to use for the title ("title"), the data labels ("labels"), and the rest of the plot ("other"). Defaults to {"title": 12, "labels": 9, "other": 12} bootstrap_x_uncertainty : bool, default False Whether to account for uncertainty in x when bootstrapping. bootstrap_y_uncertainty : bool, default False Whether to account for uncertainty in y when bootstrapping. statistic_type : {"mle", "mean"}, default "mle" The type of statistic to use, either "mle" (i.e. sample statistic) or "mean" (i.e. bootstrapped mean statistic) scatter_kwargs : dict[str, float | int | str] | None, default None Arguments to control plt.scatter(), these will override the default cinnabar settings. Returns ------- plt.Figure Raises ------- ValueError If more than 2 values are supplied to the guidelines argument. """ nsamples = len(x) # aesthetics if font_sizes is None: font_sizes = {"title": 12, "labels": 9, "other": 12} plt.rcParams["xtick.labelsize"] = font_sizes["other"] plt.rcParams["ytick.labelsize"] = font_sizes["other"] plt.rcParams["font.size"] = font_sizes["other"] fig = plt.figure(figsize=(figsize, figsize)) plt.subplots_adjust(left=0.2, right=0.8, bottom=0.2, top=0.8) plt.xlabel(f"{xlabel} {quantity} ({units})") plt.ylabel(f"{ylabel} {quantity} ({units})") if xy_lim: ax_min, ax_max = xy_lim scale = xy_lim else: ax_min = min(min(x), min(y)) - axis_padding ax_max = max(max(x), max(y)) + axis_padding scale = (ax_min, ax_max) plt.xlim(scale) plt.ylim(scale) # plots x-axis and y-axis if origins: plt.plot([0, 0], scale, "gray") plt.plot(scale, [0, 0], "gray") # plots x=y line plt.plot(scale, scale, "k:") if guidelines is not False and guidelines is not None and guidelines != (): # get the distances to use if isinstance(guidelines, bool): distances: tuple[float, ...] = (0.5, 1.0) else: distances = tuple(guidelines) if len(distances) > 2: raise ValueError(f"guidelines tuple must contain at most 2 distance values, got {len(distances)}.") # resolve per-band colors resolved_colors = [ (guideline_colors[i] if guideline_colors and i < len(guideline_colors) else "grey") for i in range(len(distances)) ] for dist, band_color in zip(distances, resolved_colors): plt.fill_between( scale, [ax_min - dist, ax_max - dist], [ax_min + dist, ax_max + dist], color=band_color, alpha=0.2, ) # label the upper boundary of the band where it exits through the top # of the axes: the line y = x + dist hits y = ax_max at x = ax_max - dist label_x = ax_max - dist label_y = ax_max if label_x > ax_min: plt.gca().text( label_x, label_y, f{dist:g}", # use :g to strip trailing zeros fontsize=7, ha="right", va="top", rotation=45, color="black", clip_on=True, ) # actual plotting cm = plt.get_cmap("coolwarm") if color is None: _color = np.abs(x - y) # 2.372 kcal / mol = 4 RT _color = cm(_color / 2.372) else: _color = color plt.errorbar( x, y, xerr=xerr, yerr=yerr, color="gray", linewidth=0.0, elinewidth=2.0, zorder=1, ) # add our cinnabar preset settings to the scatter_kwargs to make sure they do not clash # scatter kwargs will override the default settings default_kwargs: dict[str, Any] = { "color": _color, "zorder": 2, "edgecolors": "dimgrey", "linewidths": 0.7, "s": 20, "marker": "o", } if scatter_kwargs is not None: default_kwargs.update(scatter_kwargs) plt.scatter(x, y, **default_kwargs) # Label points if data_labels is not None: texts = [] for i, label in enumerate(data_labels): texts.append(plt.text(x[i] + 0.03, y[i] + 0.03, label, fontsize=font_sizes["labels"])) adjust_text(texts) # stats and title statistics_string = "" if statistic_type not in ["mle", "mean"]: raise ValueError(f"Unknown statistic type {statistic_type}") # set default statistics values if not provided if statistics is None: statistics = ["RMSE", "MUE"] for statistic in statistics: s = stats.bootstrap_statistic( x, y, xerr, yerr, statistic=statistic, include_true_uncertainty=bootstrap_x_uncertainty, include_pred_uncertainty=bootstrap_y_uncertainty, ) string = f"{statistic}: {s[statistic_type]:.2f} [95%: {s['low']:.2f}, {s['high']:.2f}] " + "\n" statistics_string += string long_title = f"{title} \n {target_name} (N = {nsamples}) \n {statistics_string}" plt.title( long_title, fontsize=font_sizes["title"], loc="right", horizontalalignment="right", family="monospace", ) if filename is None: plt.show() else: plt.savefig(filename, bbox_inches="tight", dpi=dpi) return fig
def _master_plot(*args, **kwargs): """Deprecated alias for the pair_plot function. Will be removed in a future release.""" import warnings warnings.warn( "_master_plot is deprecated and will be removed in a future release. Use pair_plot instead.", DeprecationWarning, stacklevel=2, ) return pair_plot(*args, **kwargs)
[docs] def plot_DDGs( femap: FEMap, source: str, method_name: str = "", target_name: str = "", title: str = "", map_positive: bool = False, filename: str | None = None, symmetrise: bool = False, plotly: bool = False, data_label_type: Literal["small-molecule", "protein-mutation"] | None = None, bootstrap_x_uncertainty: bool = False, bootstrap_y_uncertainty: bool = False, statistic_type: Literal["mle", "mean"] = "mle", observable_type: RELATIVE_ANALYSIS_TYPES = "ddg", temperature: Quantity = 298.15 * unit.kelvin, **kwargs, ): """Function to plot relative free energies Parameters ---------- femap : FEMap FEMap object with relative and absolute free energy edges source : str The source label of the computational relative free energies to plot against experiment. This must match the ``source`` field used when the calculations were added to the ``FEMap``. For data loaded via ``FEMap.from_csv`` (or added without an explicit source), pass ``source=""``. method_name : string, default "" Name of method associated with results, e.g. "openfe" by default an empty string. target_name : string, default "" Name of system for results, e.g. "Thrombin". title : string, default "" Title for the plot. map_positive : bool, default False Whether to map all DDGs to the positive x values. This is an aesthetic choice filename : str | None, default None Filename for plot, if None the plot will be displayed. symmetrise : bool, default False Whether to plot each datapoint twice, both positive and negative values. plotly : bool, default False Whether to use plotly to generate the plot. data_label_type : {"small-molecule", "protein-mutation"} | None, default None Type of data label to add to each edge if ``None`` data labels will not be added if ``'small-molecule'`` edge labels will be ``f"{node_A_name}→{node_B_name}"``. if ``'protein-mutation'`` edge labels will given as single letter amino acid codes separated by the mutated residue index (eg. ``"Y29A"``) If both node names start with ``"-"``, the negative sign will be factored out (eg. ``"-(Y29A)"`` or ``"-(benzene→toluene)"``). currently unsupported for plotly-generated plots .. TODO: implement data labeling for the case where plotly=True bootstrap_x_uncertainty : bool, default False Whether to account for uncertainty in x when bootstrapping. bootstrap_y_uncertainty : bool, default False Whether to account for uncertainty in y when bootstrapping. statistic_type : {"mle", "mean"}, default "mle" The type of statistic to use, either "mle" (i.e. sample statistic) or "mean" (i.e. bootstrapped mean statistic). observable_type : {"ddg", "dpic50"}, default "ddg" The observable type to plot values in. Defaults to ``ddg`` (kcal/mol). Use ``dpic50`` to plot DpIC50 values. temperature : Quantity, default 298.15 * unit.kelvin Temperature used for the unit conversion, only used if observable_type is ``"dpic50"``. Returns ------- Nothing """ assert int(symmetrise) + int(map_positive) != 2, "Symmetrise and map_positive cannot both be True in the same plot" if data_label_type: assert not plotly, "We currently do not support data labeling for plotly-generated plots" # load the data using the internal dataframes rel_df = femap.get_relative_dataframe(observable_type=observable_type, temperature=temperature) plot_meta = _OBSERVABLE_METADATA[observable_type.lower()] value_col = plot_meta["value_col"] uncertainty_col = plot_meta["uncertainty_col"] comp_mask = rel_df["computational"] all_comp_sources = rel_df.loc[comp_mask, "source"].unique().tolist() if not all_comp_sources: raise ValueError("The FEMap contains no computational edges.") if source not in all_comp_sources: raise ValueError(f"Source {source} is not a valid source, available sources: {all_comp_sources}") # get the comp data from the computational source comp_data = rel_df[comp_mask & (rel_df["source"] == source)] # get the experimental data exp_data = rel_df[~comp_mask].rename(columns={value_col: "value_exp", uncertainty_col: "uncertainty_exp"}) # merge to align the data and drop any values missing an experimental data point merged = comp_data.merge(exp_data, how="left", on=["labelA", "labelB"]) merged = merged.dropna(subset=["value_exp"]) # extract the required data x = merged["value_exp"].to_numpy(copy=True) y = merged[value_col].to_numpy(copy=True) xerr = merged["uncertainty_exp"].to_numpy(copy=True) yerr = merged[uncertainty_col].to_numpy(copy=True) # labels data_labels: list[str] = [] if data_label_type: for _, row in merged.iterrows(): node_a_name = row["labelA"] node_b_name = row["labelB"] if node_a_name.startswith("-") and node_b_name.startswith("-"): # factor out "-" if both start with it node_a_name = node_a_name[1:] node_b_name = node_b_name[1:] prefix = "-" else: prefix = "" if data_label_type == "small-molecule": data_labels.append(f"{prefix}({node_a_name}{node_b_name})") elif data_label_type == "protein-mutation": data_labels.append(f"{prefix}({node_a_name}{node_b_name[0]})") else: raise ValueError( "data_label_type unsupported. supported types: 'small-molecule' and 'protein-mutation'" ) if symmetrise: x_data = np.append(x, [-i for i in x]) y_data = np.append(y, [-i for i in y]) xerr = np.append(xerr, xerr) yerr = np.append(yerr, yerr) # double the data labels list data_labels = data_labels + data_labels elif map_positive: x_data_map = [] y_data_map = [] for i, j in zip(x, y): if i < 0: x_data_map.append(-i) y_data_map.append(-j) else: x_data_map.append(i) y_data_map.append(j) x_data = np.asarray(x_data_map) y_data = np.asarray(y_data_map) else: x_data = np.asarray(x) y_data = np.asarray(y) if plotly: plotlying._master_plot( x_data, y_data, xerr=xerr, yerr=yerr, filename=filename, plot_type="ΔΔG", title=title, method_name=method_name, target_name=target_name, bootstrap_x_uncertainty=bootstrap_x_uncertainty, bootstrap_y_uncertainty=bootstrap_y_uncertainty, statistic_type=statistic_type, **kwargs, ) else: # allow callers to override quantity/units via kwargs quantity, units, kwargs = _update_plot_kwargs(kwargs, plot_meta, observable_type, temperature) pair_plot( x_data, y_data, xerr=xerr, yerr=yerr, filename=filename, title=title, method_name=method_name, target_name=target_name, quantity=quantity, units=units, data_labels=data_labels, bootstrap_x_uncertainty=bootstrap_x_uncertainty, bootstrap_y_uncertainty=bootstrap_y_uncertainty, statistic_type=statistic_type, **kwargs, )
[docs] def plot_DGs( femap: FEMap, source: str, method_name: str = "", target_name: str = "", title: str = "", filename: str | None = None, plotly: bool = False, centralizing: bool = True, shift: float = 0.0, bootstrap_x_uncertainty: bool = False, bootstrap_y_uncertainty: bool = False, statistic_type: Literal["mle", "mean"] = "mle", observable_type: ABSOLUTE_ANALYSIS_TYPES = "dg", temperature: Quantity = 298.15 * unit.kelvin, **kwargs, ): """Function to plot absolute free energies. Parameters ---------- femap : FEMap FEMap object with absolute free energies to plot. source : str The name of the source label of the computational absolute values, if absolute values are generated with the MLE estimator this should be "MLE". method_name : string, default "" Name of method associated with results, e.g. "openfe" by default an empty string. target_name : string, default "" Name of system for results, e.g. "Thrombin" by default an empty string. title : string, default "" Title for the plot. filename : str | None, default None Filename for plot if None the plot will be displayed. bootstrap_x_uncertainty : bool, default False Whether to account for uncertainty in x when bootstrapping. bootstrap_y_uncertainty : bool, default False Whether to account for uncertainty in y when bootstrapping. statistic_type : {"mle", "mean"}, default "mle" The type of statistic to use, either "mle" (i.e. sample statistic) or "mean" (i.e. bootstrapped mean statistic). observable_type : {"dg", "pic50"}, default "dg" The observable type to plot values in. Defaults to ``dg`` (kcal/mol). Use ``pic50`` to plot pIC50 values. temperature : Quantity, default 298.15 * unit.kelvin Temperature used for the unit conversion, only used if observable_type is ``"pic50"``. Returns ------- """ # extract the data from the internal dataframes df = femap.get_absolute_dataframe(observable_type=observable_type, temperature=temperature) plot_meta = _OBSERVABLE_METADATA[observable_type.lower()] value_col = plot_meta["value_col"] uncertainty_col = plot_meta["uncertainty_col"] comp_mask = df["computational"] all_comp_sources = df.loc[comp_mask, "source"].unique().tolist() if not all_comp_sources: raise ValueError( "The FEMap contains no computed absolute values. " "Call generate_absolute_values() first or add calculated absolute measurements directly." ) if source not in all_comp_sources: raise ValueError(f"Source {source} is not a valid source, available sources: {all_comp_sources}") # get the comp data from the computational source comp_data = df[comp_mask & (df["source"] == source)] # get the experimental data exp_data = df[~comp_mask].rename(columns={value_col: "value_exp", uncertainty_col: "uncertainty_exp"}) # merge to align the data and drop any values missing an experimental data point merged = comp_data.merge(exp_data, how="left", on=["label"]) merged = merged.dropna(subset=["value_exp"]) # extract the required data x_data = merged["value_exp"].to_numpy(copy=True) y_data = merged[value_col].to_numpy(copy=True) xerr = merged["uncertainty_exp"].to_numpy(copy=True) yerr = merged[uncertainty_col].to_numpy(copy=True) # centralising # this should be replaced by providing one experimental result if centralizing: x_data = x_data - np.mean(x_data) + shift y_data = y_data - np.mean(y_data) + shift if plotly: plotlying._master_plot( x_data, y_data, xerr=xerr, yerr=yerr, origins=False, statistics=["RMSE", "MUE", "R2", "rho"], plot_type="ΔG", title=title, method_name=method_name, target_name=target_name, filename=filename, bootstrap_x_uncertainty=bootstrap_x_uncertainty, bootstrap_y_uncertainty=bootstrap_y_uncertainty, statistic_type=statistic_type, **kwargs, ) else: quantity, units, kwargs = _update_plot_kwargs(kwargs, plot_meta, observable_type, temperature) pair_plot( x_data, y_data, xerr=xerr, yerr=yerr, origins=False, statistics=["RMSE", "MUE", "R2", "rho"], quantity=quantity, units=units, title=title, method_name=method_name, target_name=target_name, filename=filename, bootstrap_x_uncertainty=bootstrap_x_uncertainty, bootstrap_y_uncertainty=bootstrap_y_uncertainty, statistic_type=statistic_type, **kwargs, )
[docs] def plot_all_DDGs( femap: FEMap, source: str, method_name: str = "", target_name: str = "", title: str = "", filename: str | None = None, plotly: bool = False, shift: float = 0.0, bootstrap_x_uncertainty: bool = False, bootstrap_y_uncertainty: bool = False, statistic_type: Literal["mle", "mean"] = "mle", observable_type: RELATIVE_ANALYSIS_TYPES = "ddg", temperature: Quantity = 298.15 * unit.kelvin, **kwargs, ): """Plots relative free energies between all ligands, which is calculated from the differences between all the absolute free energies. This data is different to ``plot_DGs``. Parameters ---------- femap : FEMap FEMap object with absolute free energies to plot. source : str The name of the source label of the computational absolute values, if absolute values are generated with the MLE estimator this should be "MLE". method_name : string, default "" Name of method associated with results, e.g. "openfe" by default an empty string. target_name : string, default "" Name of system for results, e.g. "Thrombin" by default an empty string. title : string, default "" Title for the plot. filename : str | None, default None Filename for plot if None the plot will be displayed. plotly : bool, default True Whether to use plotly for the plotting. shift : float, default 0. Shift both the x and y-axis by a constant. bootstrap_x_uncertainty : bool, default False Whether to account for uncertainty in x when bootstrapping. bootstrap_y_uncertainty : bool, default False Whether to account for uncertainty in y when bootstrapping. statistic_type : {"mle", "mean"}, default "mle" The type of statistic to use, either "mle" (i.e. sample statistic) or "mean" (i.e. bootstrapped mean statistic). observable_type : {"ddg", "dpic50"}, default "ddg" The observable type to plot values in. Defaults to ``ddg`` (kcal/mol). Use ``dpic50`` to plot DpIC50 values. temperature : Quantity, default 298.15 * unit.kelvin Temperature used for the unit conversion, only used if observable_type is ``"dpic50"``. Returns ------- """ # use the internal dataframes to get the pairwise differences rel_df = femap.get_all_to_all_relative_dataframe( symmetrical=True, observable_type=observable_type, temperature=temperature ) plot_meta = _OBSERVABLE_METADATA[observable_type.lower()] value_col = plot_meta["value_col"] uncertainty_col = plot_meta["uncertainty_col"] comp_mask = rel_df["computational"] all_comp_sources = rel_df.loc[comp_mask, "source"].unique().tolist() if not all_comp_sources: raise ValueError( "The FEMap contains no computed absolute values which are needed to obtain the all-to-all pairwise DDGs. " "Call generate_absolute_values() first or add calculated absolute measurements directly." ) if source not in all_comp_sources: raise ValueError(f"Source {source} is not a valid source, available sources: {all_comp_sources}") # get the comp data from the computational source comp_data = rel_df[comp_mask & (rel_df["source"] == source)] # get the experimental data exp_data = rel_df[~comp_mask].rename(columns={value_col: "value_exp", uncertainty_col: "uncertainty_exp"}) # merge to align the data and drop any values missing an experimental data point merged = comp_data.merge(exp_data, how="left", on=["labelA", "labelB"]) merged = merged.dropna(subset=["value_exp"]) # extract the required data x_data = merged["value_exp"].to_numpy(copy=True) y_data = merged[value_col].to_numpy(copy=True) xerr = merged["uncertainty_exp"].to_numpy(copy=True) yerr = merged[uncertainty_col].to_numpy(copy=True) if plotly: plotlying._master_plot( x_data, y_data, xerr=xerr, yerr=yerr, title=title, method_name=method_name, plot_type="ΔΔG", filename=filename, target_name=target_name, bootstrap_x_uncertainty=bootstrap_x_uncertainty, bootstrap_y_uncertainty=bootstrap_y_uncertainty, statistic_type=statistic_type, **kwargs, ) else: quantity, units, kwargs = _update_plot_kwargs(kwargs, plot_meta, observable_type, temperature) pair_plot( x_data, y_data, xerr=xerr, yerr=yerr, title=title, method_name=method_name, filename=filename, target_name=target_name, quantity=quantity, units=units, shift=shift, bootstrap_x_uncertainty=bootstrap_x_uncertainty, bootstrap_y_uncertainty=bootstrap_y_uncertainty, statistic_type=statistic_type, **kwargs, )
[docs] def ecdf_plot( datasets: dict[str, np.ndarray], title: str | None = "ECDF of Absolute Errors", xlabel: str = "Edgewise", quantity: str = r"$|\Delta\Delta$G$_{calc} - \Delta\Delta$G$_{exp}|$", units: str = r"$\mathrm{kcal\,mol^{-1}}$", ylabel: str = "Cumulative Probability", figsize: float | tuple[float, float] = 4, colors: list[str] | None = None, ecdf_kwargs: dict[str, Any] | None = None, filename: str | None = None, nbootstraps: int = 1_000, ci: float = 0.95, ) -> plt.Figure: r""" Plot ECDFs for one or more datasets. Where the dataset is a flat array of absolute errors. Parameters ---------- datasets : dict[str, np.ndarray] A dictionary where keys are dataset labels and values are the data arrays. title: str | None, default "ECDF of Absolute Errors" Title for the plot. If None, no title is set. xlabel : str, default "Edgewise" Label for the x-axis. quantity : str, default r"$\Delta\Delta$G" Metric that is being plotted. units : str, default r"$\mathrm{kcal\,mol^{-1}}$" Units of the metric being plotted. ylabel : str, default "Cumulative Probability" Label for the y-axis. figsize : float | tuple[float, float], default 4 Size of the figure. colors : list[str] | None, default None List of colors for each dataset. If None, default colors are used. ecdf_kwargs : dict, default None Additional keyword arguments to pass to seaborn.ecdfplot. filename : str | None, default None If provided, the plot will be saved to this filename. nbootstraps : int, default = 1_000 Number of bootstraps to perform for estimating confidence intervals. ci : float, default = 0.95 Confidence level for the confidence intervals (e.g., 0.95 for 95% confidence intervals). Returns ------- plt.Figure The matplotlib Figure object containing the ECDF plot which can be edited further. """ if ecdf_kwargs is None: ecdf_kwargs = {} if not datasets: raise ValueError("At least one dataset is required to plot an ECDF.") # make sure 0 < ci < 1 if not 0 < ci < 1: raise ValueError("ci must be between 0 and 1.") if not isinstance(figsize, tuple): figsize = (figsize, figsize) fig, axs = plt.subplots(figsize=figsize) # make the default ecdf_kwargs for the plot default_kwargs = { "ax": axs, "linewidth": 2, } default_kwargs.update(ecdf_kwargs) if colors is None: # get the default colors colors = sns.color_palette(None, n_colors=len(datasets)) # Iterate over the dictionary to plot ECDFs for i, (label, data) in enumerate(datasets.items()): # Pick a color for the dataset if specified color = colors[i] default_kwargs["color"] = color # we first need to sort the data so its in the same order if CIs are plotted data = np.sort(data) sns.ecdfplot(data, label=label, **default_kwargs) # estimate an error bounds via bootsrapping over the data if the number of bootstraps is > 0 if nbootstraps > 0: boot_ecdfs = [] for _ in range(nbootstraps): sample = np.random.choice(data, size=len(data), replace=True) # calculate the ECDF for this sample at each point in the true sample data # searchsorted returns an array of indices in sample where the values in data would fit in # by dividing by the length of the sample we get the fraction of sample points that are less than or # equal to each point in the data, which is the ECDF value at that point boot_ecdf = np.searchsorted(np.sort(sample), data, side="right") / len(sample) boot_ecdfs.append(boot_ecdf) # calculate the confidence interval based on the user input low_percentile = (1.0 - ci) / 2.0 * 100 high_percentile = 100 - low_percentile lower = np.percentile(boot_ecdfs, low_percentile, axis=0) upper = np.percentile(boot_ecdfs, high_percentile, axis=0) # now plot a shaded region between the confidence intervals plt.fill_between(data, lower, upper, alpha=0.2, color=color) if title is not None: plt.title(title, fontsize=14) plt.xlabel(f"{xlabel} {quantity} ({units})", fontsize=12) plt.ylabel(ylabel, fontsize=12) plt.xlim(left=0) plt.legend() # add gridlines to help identify 1, 2 kcal/mol errors plt.grid(alpha=0.3) plt.tight_layout() if filename is not None: plt.savefig(filename, bbox_inches="tight", dpi=300) return fig
[docs] def ecdf_plot_DDGs( femap: FEMap, sources: list[str] | None = None, labels: list[str] | None = None, title: str | None = "ECDF of Edgewise Absolute Errors", filename: str | None = None, observable_type: RELATIVE_ANALYSIS_TYPES = "ddg", temperature: Quantity = 298.15 * unit.kelvin, **kwargs, ) -> plt.Figure: """ Plot ECDF of absolute errors for edgewise relative free energies from multiple sources within a single FEMap. Each computational source in the FEMap (i.e. each distinct ``source`` tag on the relative calculations) is plotted as a separate ECDF curve. Parameters ---------- femap : FEMap A single FEMap containing one or more computational sources together with experimental absolute measurements. sources : list[str] | None, default None Computational source names to include in the plot. If ``None``, all computational sources are used. labels : list[str] | None, default None Display labels for the legend, one per source. If ``None``, the source names are used directly, this should be used with ``sources`` to ensure the correct label is applied to the corresponding source. title : str | None, default "ECDF of Edgewise Absolute Errors" Title for the plot. If ``None``, no title is set. filename : str | None, default None If provided, the plot will be saved to this filename. observable_type : {"ddg", "dpic50"}, default "ddg" The observable type to plot values in. Defaults to ``ddg`` (kcal/mol). Use ``dpic50`` to report DpIC50 values. temperature : Quantity, default 298.15 * unit.kelvin Temperature used for the unit conversion, only used if observable_type is ``"dpic50"``. **kwargs Additional keyword arguments to pass to `ecdf_plot`. Returns ------- plt.Figure The matplotlib Figure object containing the ECDF plot which can be edited further. Notes ----- If any edges are missing an experimental value, they will be skipped in the absolute error calculation. Raises ------ ValueError If any edges are missing a calculated DDG value, if the number of sources and labels do not match or if no computational results are in the graph. """ rel_df = femap.get_relative_dataframe(observable_type=observable_type, temperature=temperature) plot_meta = _OBSERVABLE_METADATA[observable_type.lower()] value_col = plot_meta["value_col"] comp_mask = rel_df["computational"] all_comp_sources = rel_df.loc[comp_mask, "source"].unique().tolist() if not all_comp_sources: raise ValueError("The FEMap contains no computational edges.") if sources is None: sources = all_comp_sources if labels is None: labels = list(sources) if len(sources) != len(labels): raise ValueError(f"`sources` and `labels` must have the same length, got {len(sources)} and {len(labels)}.") # get the experimental data exp_df = rel_df[~comp_mask].set_index(["labelA", "labelB"])[[value_col]].rename(columns={value_col: "value_exp"}) datasets = {} for source, label in zip(sources, labels): src_df = rel_df[comp_mask & (rel_df["source"] == source)].set_index(["labelA", "labelB"])[[value_col]] if src_df.empty: raise ValueError(f"No computational edges found for source '{source}'.") merged = src_df.join(exp_df, how="left") # skip edges without an experimental reference DDG merged = merged.dropna(subset=["value_exp"]) datasets[label] = np.abs(merged[value_col] - merged["value_exp"]).values # finally check all datasets are the same length if len({len(v) for v in datasets.values()}) != 1: raise ValueError( "Inconsistent number of computational edges across sources, make sure all edges have a result for each source." ) # allow callers to override quantity/units via kwargs quantity = kwargs.pop("quantity", plot_meta["ecdf_quantity"]) units = kwargs.pop("units", plot_meta["units"]) return ecdf_plot( datasets, title=title, filename=filename, xlabel="Edgewise", quantity=quantity, units=units, **kwargs, )
[docs] def ecdf_plot_DGs( femap: FEMap, sources: list[str] | None = None, labels: list[str] | None = None, title: str | None = "ECDF of Nodewise Absolute Errors", filename: str | None = None, centralizing: bool = True, observable_type: ABSOLUTE_ANALYSIS_TYPES = "dg", temperature: Quantity = 298.15 * unit.kelvin, **kwargs, ) -> plt.Figure: """ Plot ECDF of absolute errors for nodewise absolute free energies from multiple sources within a single FEMap. Each computational source in the FEMap is plotted as a separate ECDF curve. Parameters ---------- femap : FEMap A single FEMap containing one or more computational absolute sources together with experimental absolute measurements. sources : list[str] | None, default None Computational source names to include in the plot. If ``None``, all computational sources are used. labels : list[str] | None, default None Display labels for the legend, one per source. If ``None``, the source names are used directly, this should be used with ``sources`` to ensure the correct label is applied to the corresponding source. title : str | None, default "ECDF of Nodewise Absolute Errors" Title for the plot. filename : str | None, default None If provided, the plot will be saved to this filename. centralizing : bool, default True whether to center both calculated and experimental values around zero before calculating absolute errors. observable_type : {"dg", "pic50"}, default "dg" The observable type to plot values in. Defaults to ``dg`` (kcal/mol). Use ``pic50`` to report pIC50 values. temperature : Quantity, default 298.15 * unit.kelvin Temperature used for the unit conversion, only used if observable_type is ``"pic50"``. **kwargs Additional keyword arguments to pass to `ecdf_plot`. Returns ------- plt.Figure The matplotlib Figure object containing the ECDF plot which can be edited further. Raises ------ ValueError If any ligands are missing a calculated DG value, if the number of sources and labels do not match or if no computational results are in the graph. """ df = femap.get_absolute_dataframe(observable_type=observable_type, temperature=temperature) plot_meta = _OBSERVABLE_METADATA[observable_type.lower()] value_col = plot_meta["value_col"] comp_mask = df["computational"] all_comp_sources = df.loc[comp_mask, "source"].unique().tolist() if not all_comp_sources: raise ValueError( "The FEMap contains no computed absolute values. " "Call generate_absolute_values() first or add calculated absolute measurements directly." ) if sources is None: sources = all_comp_sources if labels is None: labels = list(sources) if len(sources) != len(labels): raise ValueError(f"`sources` and `labels` must have the same length, got {len(sources)} and {len(labels)}.") exp_df = ( df[~comp_mask] .drop_duplicates(subset=["label"]) .set_index("label")[[value_col]] .rename(columns={value_col: "value_exp"}) ) datasets = {} for source, label in zip(sources, labels): src_df = df[comp_mask & (df["source"] == source)].set_index("label")[[value_col]] if src_df.empty: raise ValueError(f"No computed absolute values found for source '{source}'.") merged = src_df.join(exp_df, how="left") merged = merged.dropna(subset=["value_exp"]) x = merged["value_exp"].values y = merged[value_col].values if centralizing: x = x - x.mean() y = y - y.mean() datasets[label] = np.abs(y - x) if len({len(v) for v in datasets.values()}) != 1: raise ValueError( "Inconsistent number of computational nodes across sources, make sure all nodes have a result for each source." ) # allow callers to override quantity/units via kwargs quantity = kwargs.pop("quantity", plot_meta["ecdf_quantity"]) units = kwargs.pop("units", plot_meta["units"]) return ecdf_plot( datasets, title=title, xlabel="Nodewise", quantity=quantity, units=units, filename=filename, **kwargs, )
[docs] def ecdf_plot_all_DDGs( femap: FEMap, sources: list[str] | None = None, labels: list[str] | None = None, title: str | None = "ECDF of Pairwise (all-to-all) Absolute Errors", filename: str | None = None, observable_type: RELATIVE_ANALYSIS_TYPES = "ddg", temperature: Quantity = 298.15 * unit.kelvin, **kwargs, ) -> plt.Figure: """ Plot ECDF of absolute errors for all-to-all relative free energies calculated from absolute free energies in a graph. Parameters ---------- femap : FEMap A single FEMap containing one or more computational absolute sources together with experimental absolute measurements. sources : list[str] | None, default None Computational source names to include. If ``None``, all computational sources in the absolute dataframe are used. labels : list[str] | None, default None Display labels for the legend. Defaults to the source names. title : str | None, default "ECDF of Pairwise (all-to-all) Absolute Errors" Title for the plot. filename : str | None, default None If provided, the plot will be saved to this filename. observable_type : {"ddg", "dpic50"}, default "ddg" The observable type to plot values in. Defaults to ``ddg`` (kcal/mol). Use ``dpic50`` to report DpIC50 values. temperature : Quantity, default 298.15 * unit.kelvin Temperature used for the unit conversion, only used if observable_type is ``"dpic50"``. **kwargs Additional keyword arguments to pass to `ecdf_plot`. Returns ------- plt.Figure The matplotlib Figure object containing the ECDF plot which can be edited further. Raises ------ ValueError If the FEMap contains no computed absolute values, if a requested source cannot be found, or if ``sources`` and ``labels`` differ in length. Notes ----- Ligands without experimental absolute values are excluded before computing pairwise combinations. Only unique (unordered) pairs are included, so N ligands contribute N*(N-1)/2 data points. """ rel_df = femap.get_all_to_all_relative_dataframe( symmetrical=False, observable_type=observable_type, temperature=temperature ) plot_meta = _OBSERVABLE_METADATA[observable_type.lower()] value_col = plot_meta["value_col"] comp_mask = rel_df["computational"] all_comp_sources = rel_df.loc[comp_mask, "source"].unique().tolist() if not all_comp_sources: raise ValueError( "The FEMap contains no computed absolute values which are need to obtain the all-to-all pairwise DDGs. " "Call generate_absolute_values() first or add calculated absolute measurements directly." ) if sources is None: sources = all_comp_sources if labels is None: labels = list(sources) if len(sources) != len(labels): raise ValueError(f"`sources` and `labels` must have the same length, got {len(sources)} and {len(labels)}.") exp_df = rel_df[~comp_mask].set_index(["labelA", "labelB"])[[value_col]].rename(columns={value_col: "value_exp"}) datasets = {} for source, label in zip(sources, labels): src_df = rel_df[comp_mask & (rel_df["source"] == source)].set_index(["labelA", "labelB"])[[value_col]] if src_df.empty: raise ValueError(f"No computational edges found for source '{source}'.") merged = src_df.join(exp_df, how="left") # skip edges without an experimental reference DDG merged = merged.dropna(subset=["value_exp"]) datasets[label] = np.abs(merged[value_col] - merged["value_exp"]).values # finally check all datasets are the same length if len({len(v) for v in datasets.values()}) != 1: raise ValueError( "Inconsistent number of computational edges across sources, make sure all edges have a result for each source." ) # allow callers to override quantity/units via kwargs quantity = kwargs.pop("quantity", plot_meta["ecdf_quantity"]) units = kwargs.pop("units", plot_meta["units"]) return ecdf_plot( datasets, title=title, xlabel="Pairwise", quantity=quantity, units=units, filename=filename, **kwargs, )
[docs] def plot_cycle_closure( fe_map: FEMap, filename: str | None, max_cycle_length: int = 5, sources: list[str] | None = None, bin_width: float = 0.5, ) -> plt.Figure: """ Plot a histogram of cycle closure errors, taking the ``cc_per_edge (kcal/mol)`` which is the cycle closure divided by the square root of the cycle length. Parameters ---------- fe_map : FEMap FEMap object containing the calculated edges. filename : str | None, default None If provided, the plot will be saved to this filename. max_cycle_length : int, default 5 Only consider cycles up to this length. Defaults to 5. sources : list[str] | None, default None List of sources to plot. If None, all sources are plotted. bin_width : float, default 0.5 Width of histogram bins in kcal/mol. Default: 0.5 Returns ------- plt.Figure The matplotlib Figure object containing the histogram which can be edited further. Raises ------ ValueError If the FEMap contains no cycles, or if a requested source cannot be found. """ df = fe_map.get_cycle_closure_dataframe(max_cycle_length=max_cycle_length) if df.empty: raise ValueError("The FEMap does not contain cycles.") if sources is not None: df = df[df["source"].isin(sources)] if df.empty: raise ValueError(f"No cycles found for sources {sources}.") unique_sources = df["source"].unique() fig, ax = plt.subplots(figsize=(5, 4)) max_val = df["cc_per_edge (kcal/mol)"].max() bins = np.arange(0, max_val + bin_width, bin_width).tolist() for source in unique_sources: source_df = df[df["source"] == source] ax.hist(source_df["cc_per_edge (kcal/mol)"], bins=bins, alpha=0.6, label=source) ax.set_xlabel(r"Cycle closure per edge (kcal mol$^{-1}$)") ax.set_ylabel("Count") ax.set_title("Cycle closure distribution") ax.legend() fig.tight_layout() if filename is None: plt.show() else: fig.savefig(filename, bbox_inches="tight", dpi=300) return fig