Source code for cinnabar.plotting

import itertools
from typing import Any, Literal

import matplotlib.pylab as plt
import networkx as nx
import numpy as np
import seaborn as sns
from adjustText import adjust_text
from openff.units import Quantity, unit

from cinnabar import conversion, plotlying, stats
from cinnabar.femap import ABSOLUTE_ANALYSIS_TYPES, RELATIVE_ANALYSIS_TYPES, FEMap

# Define the default guidelines in kcal/mol for the scatter plots
_DEFAULT_GUIDELINES_DG = (0.5, 1.0)


def _convert_guidelines_to_pic50(guidelines: tuple[float, float], temperature: Quantity) -> tuple[float, ...]:
    """
    Convert the given guidelines from kcal/mol to pIC50 values at the given temperature.

    Parameters
    ----------
    guidelines : tuple[float, float]
        The guideline distances in kcal/mol to convert.
    temperature : Quantity
        The temperature at which to perform the conversion.
    """
    converted: list[float] = []
    for g in guidelines:
        dg = Quantity(g, units=unit.kilocalorie_per_mole)
        pic50, _ = conversion.convert_observable(dg, original_type="dg", final_type="pic50", temperature=temperature)
        converted.append(abs(pic50.m))  # type: ignore[arg-type]
    return tuple(converted)


def _update_plot_kwargs(
    kwargs: dict,
    plot_meta: dict[str, str],
    observable_type: str,
    temperature: Quantity,
) -> tuple[str, str, dict]:
    """Extract and resolve quantity, units, and guidelines from kwargs for pair_plot calls."""
    quantity = kwargs.pop("quantity", plot_meta["quantity"])
    units = kwargs.pop("units", plot_meta["units"])
    if kwargs.get("guidelines", True) is True and observable_type.lower().endswith("pic50"):
        kwargs["guidelines"] = _convert_guidelines_to_pic50(guidelines=_DEFAULT_GUIDELINES_DG, temperature=temperature)
    return quantity, units, kwargs


# Map the observable type to expected dataframe column names and the units for the plot
_OBSERVABLE_METADATA: dict[str, dict[str, str]] = {
    "ddg": {
        "value_col": "DDG (kcal/mol)",
        "uncertainty_col": "uncertainty (kcal/mol)",
        "quantity": r"$\Delta\Delta$G",
        "units": r"$\mathrm{kcal\,mol^{-1}}$",
        "ecdf_quantity": r"$|\Delta\Delta$G$_{calc} - \Delta\Delta$G$_{exp}|$",
    },
    "dg": {
        "value_col": "DG (kcal/mol)",
        "uncertainty_col": "uncertainty (kcal/mol)",
        "quantity": r"$\Delta$G",
        "units": r"$\mathrm{kcal\,mol^{-1}}$",
        "ecdf_quantity": r"$|\Delta$G$_{calc} - \Delta$G$_{exp}|$",
    },
    "dpic50": {
        "value_col": "DpIC50",
        "uncertainty_col": "uncertainty (unitless)",
        "quantity": r"$\Delta$pIC50",
        "units": "unitless",
        "ecdf_quantity": r"$|\Delta$pIC50$_{calc}$ - $\Delta$pIC50$_{exp}|$",
    },
    "pic50": {
        "value_col": "pIC50",
        "uncertainty_col": "uncertainty (unitless)",
        "quantity": "pIC50",
        "units": "unitless",
        "ecdf_quantity": r"$|$pIC50$_{calc}$ - pIC50$_{exp}|$",
    },
}



[docs]
def pair_plot(
    x: np.ndarray,
    y: np.ndarray,
    title: str = "",
    xerr: np.ndarray | None = None,
    yerr: np.ndarray | None = None,
    method_name: str = "",
    target_name: str = "",
    quantity: str = r"$\Delta\Delta$G",
    xlabel: str = "Experimental",
    ylabel: str = "Calculated",
    units: str = r"$\mathrm{kcal\,mol^{-1}}$",
    guidelines: bool | tuple[float, float] = True,
    guideline_colors: tuple[str, str] | None = None,
    origins: bool = True,
    color: str | None = None,
    statistics: list[Literal["RMSE", "NRMSE", "MUE", "RAE", "R2", "rho", "KTAU", "PI"]] | None = None,
    filename: str | None = None,
    centralizing: bool = True,
    shift: float = 0.0,
    figsize: float = 3.25,
    dpi: float | str = 300,
    data_labels: list[str] | None = None,
    axis_padding: float = 0.5,
    xy_lim: tuple[float, float] | None = None,
    font_sizes: dict[str, int] | None = None,
    bootstrap_x_uncertainty: bool = False,
    bootstrap_y_uncertainty: bool = False,
    statistic_type: Literal["mle", "mean"] = "mle",
    scatter_kwargs: dict[str, float | int | str] | None = None,
):
    r"""Handles the aesthetics of the pair plots in one place.

    Parameters
    ----------
    x : np.ndarray
        Values to plot on the x-axis.
    y : np.ndarray
        Values to plot on the y-axis
    title : str, default ""
        Title for the plot.
    xerr : np.ndarray | None , default None
        Error bars for x values if available.
    yerr : np.ndarray | None , default None
        Error bars for y values if available.
    method_name : str, default ""
        Name of method associated with results, e.g. "openfe".
    target_name : str, default ""
        Name of system for results, e.g. "Thrombin".
    quantity : str, default r"$\Delta\Delta$G"
        Metric that is being plotted, which will be included in the axis labels.
    xlabel : str, default "Experimental"
        Main abel for the x-axis.
    ylabel : str, default "Calculated"
        Main label for the y-axis.
    units : str, default r"$\mathrm{kcal\,mol^{-1}}$"
        String value of units to label axis with.
    guidelines : bool | tuple[float, float], default True
        Controls the shaded guideline bands drawn around the x=y line.

        - ``True``: draw the default two bands at 0.5 and 1.0 kcal/mol.
        - ``False``: no guidelines are drawn.
        - ``tuple`` of up to 2 floats: draw a band for each value, e.g.
          ``(0.5,)`` for a single band or ``(0.5, 1.0)`` for two bands.

        When calling the high-level plotting functions (``plot_DDGs``,
        ``plot_DGs``, ``plot_all_DDGs``) with ``observable_type="pic50"``,
        the default distances are automatically converted to pIC50 units
        so the bands remain physically meaningful.  Pass an explicit ``guidelines`` tuple to override this.
    guideline_colors : tuple[str, str], default None
        Colors for each guideline band, corresponding positionally to each
        distance in *guidelines*. If ``None``, or fewer colors than bands are
        provided, the remaining bands default to ``"grey"``.
        Example: ``("blue", "red")`` colors the inner band blue and the outer
        band red.
    origins : bool, default True
        Toggles plotting of x and y-axis.
    color : str, default None
        The name of the colour scheme for the scatter plots. If None, will be coloured according to distance from unity
    statistics : list[{'RMSE', 'NRMSE', 'MUE', 'RAE', 'R2', 'rho', 'KTAU', 'PI'}] | None, default None
        List of statistics to calculate and report on the plot, if None "RMSE" and "MUE" will be reported.
    filename : str | None, default None
        Filename for plot, if not provided the plot will be displayed.
    centralizing : bool, default True
        Offset the free energies ``True`` or report raw values ``False``.
    shift : float, default 0.
        Shift both the x and y-axis by a constant.
    figsize : float, default 3.25
        Size of figure for matplotlib.
    dpi : float | str, default 300
        The resolution in dots per inch
        if 'figure', uses the figure's dpi value (this behavior is copied from
        https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.savefig.html)
    data_labels : list[str] | None, default None
        List of labels for each data point.
    axis_padding : float, default 0.5
        Padding to add to maximum axis value and subtract from the minimum axis value.
    xy_lim : tuple[float, float] | None, default None
        Contains the minimum and maximum values to use for the x and y-axis. if specified, ``axis_padding`` is ignored.
    font_sizes : dict[str, int] | None, default None
        Font sizes to use for the title ("title"), the data labels ("labels"), and the rest of the plot ("other").
        Defaults to {"title": 12, "labels": 9, "other": 12}
    bootstrap_x_uncertainty : bool, default False
        Whether to account for uncertainty in x when bootstrapping.
    bootstrap_y_uncertainty : bool, default False
        Whether to account for uncertainty in y when bootstrapping.
    statistic_type : {"mle", "mean"}, default "mle"
        The type of statistic to use, either "mle" (i.e. sample statistic) or "mean" (i.e. bootstrapped mean statistic)
    scatter_kwargs : dict[str, float | int | str] | None, default None
        Arguments to control plt.scatter(), these will override the default cinnabar settings.

    Returns
    -------
    plt.Figure

    Raises
    -------
    ValueError
        If more than 2 values are supplied to the guidelines argument.

    """
    nsamples = len(x)

    # aesthetics
    if font_sizes is None:
        font_sizes = {"title": 12, "labels": 9, "other": 12}

    plt.rcParams["xtick.labelsize"] = font_sizes["other"]
    plt.rcParams["ytick.labelsize"] = font_sizes["other"]
    plt.rcParams["font.size"] = font_sizes["other"]

    fig = plt.figure(figsize=(figsize, figsize))
    plt.subplots_adjust(left=0.2, right=0.8, bottom=0.2, top=0.8)

    plt.xlabel(f"{xlabel} {quantity} ({units})")
    plt.ylabel(f"{ylabel} {quantity} ({units})")

    if xy_lim:
        ax_min, ax_max = xy_lim
        scale = xy_lim
    else:
        ax_min = min(min(x), min(y)) - axis_padding
        ax_max = max(max(x), max(y)) + axis_padding
        scale = (ax_min, ax_max)

    plt.xlim(scale)
    plt.ylim(scale)

    # plots x-axis and y-axis
    if origins:
        plt.plot([0, 0], scale, "gray")
        plt.plot(scale, [0, 0], "gray")

    # plots x=y line
    plt.plot(scale, scale, "k:")
    if guidelines is not False and guidelines is not None and guidelines != ():
        # get the distances to use
        if isinstance(guidelines, bool):
            distances: tuple[float, ...] = (0.5, 1.0)
        else:
            distances = tuple(guidelines)
            if len(distances) > 2:
                raise ValueError(f"guidelines tuple must contain at most 2 distance values, got {len(distances)}.")
        # resolve per-band colors
        resolved_colors = [
            (guideline_colors[i] if guideline_colors and i < len(guideline_colors) else "grey")
            for i in range(len(distances))
        ]
        for dist, band_color in zip(distances, resolved_colors):
            plt.fill_between(
                scale,
                [ax_min - dist, ax_max - dist],
                [ax_min + dist, ax_max + dist],
                color=band_color,
                alpha=0.2,
            )
            # label the upper boundary of the band where it exits through the top
            # of the axes: the line y = x + dist hits y = ax_max at x = ax_max - dist
            label_x = ax_max - dist
            label_y = ax_max
            if label_x > ax_min:
                plt.gca().text(
                    label_x,
                    label_y,
                    f"±{dist:g}",  # use :g to strip trailing zeros
                    fontsize=7,
                    ha="right",
                    va="top",
                    rotation=45,
                    color="black",
                    clip_on=True,
                )
    # actual plotting
    cm = plt.get_cmap("coolwarm")

    if color is None:
        _color = np.abs(x - y)
        # 2.372 kcal / mol = 4 RT
        _color = cm(_color / 2.372)
    else:
        _color = color

    plt.errorbar(
        x,
        y,
        xerr=xerr,
        yerr=yerr,
        color="gray",
        linewidth=0.0,
        elinewidth=2.0,
        zorder=1,
    )
    # add our cinnabar preset settings to the scatter_kwargs to make sure they do not clash
    # scatter kwargs will override the default settings
    default_kwargs: dict[str, Any] = {
        "color": _color,
        "zorder": 2,
        "edgecolors": "dimgrey",
        "linewidths": 0.7,
        "s": 20,
        "marker": "o",
    }
    if scatter_kwargs is not None:
        default_kwargs.update(scatter_kwargs)
    plt.scatter(x, y, **default_kwargs)

    # Label points
    if data_labels is not None:
        texts = []
        for i, label in enumerate(data_labels):
            texts.append(plt.text(x[i] + 0.03, y[i] + 0.03, label, fontsize=font_sizes["labels"]))
        adjust_text(texts)

    # stats and title
    statistics_string = ""
    if statistic_type not in ["mle", "mean"]:
        raise ValueError(f"Unknown statistic type {statistic_type}")

    # set default statistics values if not provided
    if statistics is None:
        statistics = ["RMSE", "MUE"]

    for statistic in statistics:
        s = stats.bootstrap_statistic(
            x,
            y,
            xerr,
            yerr,
            statistic=statistic,
            include_true_uncertainty=bootstrap_x_uncertainty,
            include_pred_uncertainty=bootstrap_y_uncertainty,
        )
        string = f"{statistic}:   {s[statistic_type]:.2f} [95%: {s['low']:.2f}, {s['high']:.2f}] " + "\n"
        statistics_string += string

    long_title = f"{title} \n {target_name} (N = {nsamples}) \n {statistics_string}"

    plt.title(
        long_title,
        fontsize=font_sizes["title"],
        loc="right",
        horizontalalignment="right",
        family="monospace",
    )

    if filename is None:
        plt.show()
    else:
        plt.savefig(filename, bbox_inches="tight", dpi=dpi)
    return fig



def _master_plot(*args, **kwargs):
    """Deprecated alias for the pair_plot function. Will be removed in a future release."""
    import warnings

    warnings.warn(
        "_master_plot is deprecated and will be removed in a future release. Use pair_plot instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return pair_plot(*args, **kwargs)



[docs]
def plot_DDGs(
    femap: FEMap,
    source: str,
    method_name: str = "",
    target_name: str = "",
    title: str = "",
    map_positive: bool = False,
    filename: str | None = None,
    symmetrise: bool = False,
    plotly: bool = False,
    data_label_type: Literal["small-molecule", "protein-mutation"] | None = None,
    bootstrap_x_uncertainty: bool = False,
    bootstrap_y_uncertainty: bool = False,
    statistic_type: Literal["mle", "mean"] = "mle",
    observable_type: RELATIVE_ANALYSIS_TYPES = "ddg",
    temperature: Quantity = 298.15 * unit.kelvin,
    **kwargs,
):
    """Function to plot relative free energies

    Parameters
    ----------
    femap : FEMap
        FEMap object with relative and absolute free energy edges
    source : str
        The source label of the computational relative free energies to plot against experiment.
        This must match the ``source`` field used when the calculations were added to the
        ``FEMap``.  For data loaded via ``FEMap.from_csv``
        (or added without an explicit source), pass ``source=""``.
    method_name : string, default ""
        Name of method associated with results, e.g. "openfe" by default an empty string.
    target_name : string, default ""
        Name of system for results, e.g. "Thrombin".
    title : string, default ""
        Title for the plot.
    map_positive : bool, default False
        Whether to map all DDGs to the positive x values.
        This is an aesthetic choice
    filename : str | None, default None
        Filename for plot, if None the plot will be displayed.
    symmetrise : bool, default False
        Whether to plot each datapoint twice, both positive and negative values.
    plotly : bool, default False
        Whether to use plotly to generate the plot.
    data_label_type : {"small-molecule", "protein-mutation"} | None, default None
        Type of data label to add to each edge

        if ``None`` data labels will not be added

        if ``'small-molecule'`` edge labels will be ``f"{node_A_name}→{node_B_name}"``.

        if ``'protein-mutation'`` edge labels will given as single letter amino
        acid codes separated by the mutated residue index (eg. ``"Y29A"``)

        If both node names start with ``"-"``, the negative sign will be factored
        out (eg. ``"-(Y29A)"`` or ``"-(benzene→toluene)"``).

        currently unsupported for plotly-generated plots

        .. TODO: implement data labeling for the case where plotly=True

    bootstrap_x_uncertainty : bool, default False
        Whether to account for uncertainty in x when bootstrapping.
    bootstrap_y_uncertainty : bool, default False
        Whether to account for uncertainty in y when bootstrapping.
    statistic_type : {"mle", "mean"}, default "mle"
        The type of statistic to use, either "mle" (i.e. sample statistic) or "mean" (i.e. bootstrapped mean statistic).
    observable_type : {"ddg", "dpic50"}, default "ddg"
        The observable type to plot values in.  Defaults to ``ddg`` (kcal/mol).
        Use ``dpic50`` to plot DpIC50 values.
    temperature : Quantity, default 298.15 * unit.kelvin
        Temperature used for the unit conversion, only used if observable_type is ``"dpic50"``.

    Returns
    -------
    Nothing
    """

    assert int(symmetrise) + int(map_positive) != 2, "Symmetrise and map_positive cannot both be True in the same plot"

    if data_label_type:
        assert not plotly, "We currently do not support data labeling for plotly-generated plots"

    # load the data using the internal dataframes
    rel_df = femap.get_relative_dataframe(observable_type=observable_type, temperature=temperature)
    plot_meta = _OBSERVABLE_METADATA[observable_type.lower()]
    value_col = plot_meta["value_col"]
    uncertainty_col = plot_meta["uncertainty_col"]
    comp_mask = rel_df["computational"]
    all_comp_sources = rel_df.loc[comp_mask, "source"].unique().tolist()

    if not all_comp_sources:
        raise ValueError("The FEMap contains no computational edges.")

    if source not in all_comp_sources:
        raise ValueError(f"Source {source} is not a valid source, available sources: {all_comp_sources}")

    # get the comp data from the computational source
    comp_data = rel_df[comp_mask & (rel_df["source"] == source)]
    # get the experimental data
    exp_data = rel_df[~comp_mask].rename(columns={value_col: "value_exp", uncertainty_col: "uncertainty_exp"})

    # merge to align the data and drop any values missing an experimental data point
    merged = comp_data.merge(exp_data, how="left", on=["labelA", "labelB"])
    merged = merged.dropna(subset=["value_exp"])

    # extract the required data
    x = merged["value_exp"].to_numpy(copy=True)
    y = merged[value_col].to_numpy(copy=True)
    xerr = merged["uncertainty_exp"].to_numpy(copy=True)
    yerr = merged[uncertainty_col].to_numpy(copy=True)

    # labels
    data_labels: list[str] = []
    if data_label_type:
        for _, row in merged.iterrows():
            node_a_name = row["labelA"]
            node_b_name = row["labelB"]
            if node_a_name.startswith("-") and node_b_name.startswith("-"):
                # factor out "-" if both start with it
                node_a_name = node_a_name[1:]
                node_b_name = node_b_name[1:]
                prefix = "-"
            else:
                prefix = ""
            if data_label_type == "small-molecule":
                data_labels.append(f"{prefix}({node_a_name}→{node_b_name})")
            elif data_label_type == "protein-mutation":
                data_labels.append(f"{prefix}({node_a_name}{node_b_name[0]})")
            else:
                raise ValueError(
                    "data_label_type unsupported. supported types: 'small-molecule' and 'protein-mutation'"
                )

    if symmetrise:
        x_data = np.append(x, [-i for i in x])
        y_data = np.append(y, [-i for i in y])
        xerr = np.append(xerr, xerr)
        yerr = np.append(yerr, yerr)
        # double the data labels list
        data_labels = data_labels + data_labels

    elif map_positive:
        x_data_map = []
        y_data_map = []
        for i, j in zip(x, y):
            if i < 0:
                x_data_map.append(-i)
                y_data_map.append(-j)
            else:
                x_data_map.append(i)
                y_data_map.append(j)
        x_data = np.asarray(x_data_map)
        y_data = np.asarray(y_data_map)
    else:
        x_data = np.asarray(x)
        y_data = np.asarray(y)

    if plotly:
        plotlying._master_plot(
            x_data,
            y_data,
            xerr=xerr,
            yerr=yerr,
            filename=filename,
            plot_type="ΔΔG",
            title=title,
            method_name=method_name,
            target_name=target_name,
            bootstrap_x_uncertainty=bootstrap_x_uncertainty,
            bootstrap_y_uncertainty=bootstrap_y_uncertainty,
            statistic_type=statistic_type,
            **kwargs,
        )
    else:
        # allow callers to override quantity/units via kwargs
        quantity, units, kwargs = _update_plot_kwargs(kwargs, plot_meta, observable_type, temperature)
        pair_plot(
            x_data,
            y_data,
            xerr=xerr,
            yerr=yerr,
            filename=filename,
            title=title,
            method_name=method_name,
            target_name=target_name,
            quantity=quantity,
            units=units,
            data_labels=data_labels,
            bootstrap_x_uncertainty=bootstrap_x_uncertainty,
            bootstrap_y_uncertainty=bootstrap_y_uncertainty,
            statistic_type=statistic_type,
            **kwargs,
        )




[docs]
def plot_DGs(
    femap: FEMap,
    source: str,
    method_name: str = "",
    target_name: str = "",
    title: str = "",
    filename: str | None = None,
    plotly: bool = False,
    centralizing: bool = True,
    shift: float = 0.0,
    bootstrap_x_uncertainty: bool = False,
    bootstrap_y_uncertainty: bool = False,
    statistic_type: Literal["mle", "mean"] = "mle",
    observable_type: ABSOLUTE_ANALYSIS_TYPES = "dg",
    temperature: Quantity = 298.15 * unit.kelvin,
    **kwargs,
):
    """Function to plot absolute free energies.

    Parameters
    ----------
    femap : FEMap
        FEMap object with absolute free energies to plot.
    source : str
        The name of the source label of the computational absolute values, if absolute values are generated with the MLE estimator this should be "MLE".
    method_name : string, default ""
        Name of method associated with results, e.g. "openfe" by default an empty string.
    target_name : string, default ""
        Name of system for results, e.g. "Thrombin" by default an empty string.
    title : string, default ""
        Title for the plot.
    filename : str | None, default None
        Filename for plot if None the plot will be displayed.
    bootstrap_x_uncertainty : bool, default False
        Whether to account for uncertainty in x when bootstrapping.
    bootstrap_y_uncertainty : bool, default False
        Whether to account for uncertainty in y when bootstrapping.
    statistic_type : {"mle", "mean"}, default "mle"
        The type of statistic to use, either "mle" (i.e. sample statistic) or "mean" (i.e. bootstrapped mean statistic).
    observable_type : {"dg", "pic50"}, default "dg"
        The observable type to plot values in.  Defaults to ``dg`` (kcal/mol).
        Use ``pic50`` to plot pIC50 values.
    temperature : Quantity, default 298.15 * unit.kelvin
        Temperature used for the unit conversion, only used if observable_type is ``"pic50"``.

    Returns
    -------

    """

    # extract the data from the internal dataframes
    df = femap.get_absolute_dataframe(observable_type=observable_type, temperature=temperature)
    plot_meta = _OBSERVABLE_METADATA[observable_type.lower()]
    value_col = plot_meta["value_col"]
    uncertainty_col = plot_meta["uncertainty_col"]

    comp_mask = df["computational"]
    all_comp_sources = df.loc[comp_mask, "source"].unique().tolist()

    if not all_comp_sources:
        raise ValueError(
            "The FEMap contains no computed absolute values. "
            "Call generate_absolute_values() first or add calculated absolute measurements directly."
        )

    if source not in all_comp_sources:
        raise ValueError(f"Source {source} is not a valid source, available sources: {all_comp_sources}")

    # get the comp data from the computational source
    comp_data = df[comp_mask & (df["source"] == source)]
    # get the experimental data
    exp_data = df[~comp_mask].rename(columns={value_col: "value_exp", uncertainty_col: "uncertainty_exp"})

    # merge to align the data and drop any values missing an experimental data point
    merged = comp_data.merge(exp_data, how="left", on=["label"])
    merged = merged.dropna(subset=["value_exp"])

    # extract the required data
    x_data = merged["value_exp"].to_numpy(copy=True)
    y_data = merged[value_col].to_numpy(copy=True)
    xerr = merged["uncertainty_exp"].to_numpy(copy=True)
    yerr = merged[uncertainty_col].to_numpy(copy=True)

    # centralising
    # this should be replaced by providing one experimental result
    if centralizing:
        x_data = x_data - np.mean(x_data) + shift
        y_data = y_data - np.mean(y_data) + shift

    if plotly:
        plotlying._master_plot(
            x_data,
            y_data,
            xerr=xerr,
            yerr=yerr,
            origins=False,
            statistics=["RMSE", "MUE", "R2", "rho"],
            plot_type="ΔG",
            title=title,
            method_name=method_name,
            target_name=target_name,
            filename=filename,
            bootstrap_x_uncertainty=bootstrap_x_uncertainty,
            bootstrap_y_uncertainty=bootstrap_y_uncertainty,
            statistic_type=statistic_type,
            **kwargs,
        )
    else:
        quantity, units, kwargs = _update_plot_kwargs(kwargs, plot_meta, observable_type, temperature)
        pair_plot(
            x_data,
            y_data,
            xerr=xerr,
            yerr=yerr,
            origins=False,
            statistics=["RMSE", "MUE", "R2", "rho"],
            quantity=quantity,
            units=units,
            title=title,
            method_name=method_name,
            target_name=target_name,
            filename=filename,
            bootstrap_x_uncertainty=bootstrap_x_uncertainty,
            bootstrap_y_uncertainty=bootstrap_y_uncertainty,
            statistic_type=statistic_type,
            **kwargs,
        )




[docs]
def plot_all_DDGs(
    femap: FEMap,
    source: str,
    method_name: str = "",
    target_name: str = "",
    title: str = "",
    filename: str | None = None,
    plotly: bool = False,
    shift: float = 0.0,
    bootstrap_x_uncertainty: bool = False,
    bootstrap_y_uncertainty: bool = False,
    statistic_type: Literal["mle", "mean"] = "mle",
    observable_type: RELATIVE_ANALYSIS_TYPES = "ddg",
    temperature: Quantity = 298.15 * unit.kelvin,
    **kwargs,
):
    """Plots relative free energies between all ligands, which is calculated from
    the differences between all the absolute free energies. This data is different to ``plot_DGs``.

    Parameters
    ----------
    femap : FEMap
        FEMap object with absolute free energies to plot.
    source : str
        The name of the source label of the computational absolute values, if absolute values are generated with the MLE estimator this should be "MLE".
    method_name : string, default ""
        Name of method associated with results, e.g. "openfe" by default an empty string.
    target_name : string, default ""
        Name of system for results, e.g. "Thrombin" by default an empty string.
    title : string, default ""
        Title for the plot.
    filename : str | None, default None
        Filename for plot if None the plot will be displayed.
    plotly : bool, default True
        Whether to use plotly for the plotting.
    shift : float, default 0.
        Shift both the x and y-axis by a constant.
    bootstrap_x_uncertainty : bool, default False
        Whether to account for uncertainty in x when bootstrapping.
    bootstrap_y_uncertainty : bool, default False
        Whether to account for uncertainty in y when bootstrapping.
    statistic_type : {"mle", "mean"}, default "mle"
        The type of statistic to use, either "mle" (i.e. sample statistic) or "mean" (i.e. bootstrapped mean statistic).
    observable_type : {"ddg", "dpic50"}, default "ddg"
        The observable type to plot values in.  Defaults to ``ddg`` (kcal/mol).
        Use ``dpic50`` to plot DpIC50 values.
    temperature : Quantity, default 298.15 * unit.kelvin
        Temperature used for the unit conversion, only used if observable_type is ``"dpic50"``.

    Returns
    -------

    """
    # use the internal dataframes to get the pairwise differences
    rel_df = femap.get_all_to_all_relative_dataframe(
        symmetrical=True, observable_type=observable_type, temperature=temperature
    )
    plot_meta = _OBSERVABLE_METADATA[observable_type.lower()]
    value_col = plot_meta["value_col"]
    uncertainty_col = plot_meta["uncertainty_col"]

    comp_mask = rel_df["computational"]
    all_comp_sources = rel_df.loc[comp_mask, "source"].unique().tolist()

    if not all_comp_sources:
        raise ValueError(
            "The FEMap contains no computed absolute values which are needed to obtain the all-to-all pairwise DDGs. "
            "Call generate_absolute_values() first or add calculated absolute measurements directly."
        )

    if source not in all_comp_sources:
        raise ValueError(f"Source {source} is not a valid source, available sources: {all_comp_sources}")

    # get the comp data from the computational source
    comp_data = rel_df[comp_mask & (rel_df["source"] == source)]
    # get the experimental data
    exp_data = rel_df[~comp_mask].rename(columns={value_col: "value_exp", uncertainty_col: "uncertainty_exp"})

    # merge to align the data and drop any values missing an experimental data point
    merged = comp_data.merge(exp_data, how="left", on=["labelA", "labelB"])
    merged = merged.dropna(subset=["value_exp"])

    # extract the required data
    x_data = merged["value_exp"].to_numpy(copy=True)
    y_data = merged[value_col].to_numpy(copy=True)
    xerr = merged["uncertainty_exp"].to_numpy(copy=True)
    yerr = merged[uncertainty_col].to_numpy(copy=True)

    if plotly:
        plotlying._master_plot(
            x_data,
            y_data,
            xerr=xerr,
            yerr=yerr,
            title=title,
            method_name=method_name,
            plot_type="ΔΔG",
            filename=filename,
            target_name=target_name,
            bootstrap_x_uncertainty=bootstrap_x_uncertainty,
            bootstrap_y_uncertainty=bootstrap_y_uncertainty,
            statistic_type=statistic_type,
            **kwargs,
        )

    else:
        quantity, units, kwargs = _update_plot_kwargs(kwargs, plot_meta, observable_type, temperature)
        pair_plot(
            x_data,
            y_data,
            xerr=xerr,
            yerr=yerr,
            title=title,
            method_name=method_name,
            filename=filename,
            target_name=target_name,
            quantity=quantity,
            units=units,
            shift=shift,
            bootstrap_x_uncertainty=bootstrap_x_uncertainty,
            bootstrap_y_uncertainty=bootstrap_y_uncertainty,
            statistic_type=statistic_type,
            **kwargs,
        )




[docs]
def ecdf_plot(
    datasets: dict[str, np.ndarray],
    title: str | None = "ECDF of Absolute Errors",
    xlabel: str = "Edgewise",
    quantity: str = r"$|\Delta\Delta$G$_{calc} - \Delta\Delta$G$_{exp}|$",
    units: str = r"$\mathrm{kcal\,mol^{-1}}$",
    ylabel: str = "Cumulative Probability",
    figsize: float | tuple[float, float] = 4,
    colors: list[str] | None = None,
    ecdf_kwargs: dict[str, Any] | None = None,
    filename: str | None = None,
    nbootstraps: int = 1_000,
    ci: float = 0.95,
) -> plt.Figure:
    r"""
    Plot ECDFs for one or more datasets. Where the dataset is a flat array of absolute errors.

    Parameters
    ----------
    datasets : dict[str, np.ndarray]
        A dictionary where keys are dataset labels and values are the data arrays.
    title: str | None, default "ECDF of Absolute Errors"
        Title for the plot. If None, no title is set.
    xlabel : str, default "Edgewise"
        Label for the x-axis.
    quantity : str, default r"$\Delta\Delta$G"
        Metric that is being plotted.
    units : str, default r"$\mathrm{kcal\,mol^{-1}}$"
        Units of the metric being plotted.
    ylabel : str, default "Cumulative Probability"
        Label for the y-axis.
    figsize : float | tuple[float, float], default 4
        Size of the figure.
    colors : list[str] | None, default None
        List of colors for each dataset. If None, default colors are used.
    ecdf_kwargs : dict, default None
        Additional keyword arguments to pass to seaborn.ecdfplot.
    filename : str | None, default None
        If provided, the plot will be saved to this filename.
    nbootstraps : int, default = 1_000
        Number of bootstraps to perform for estimating confidence intervals.
    ci : float, default = 0.95
        Confidence level for the confidence intervals (e.g., 0.95 for 95% confidence intervals).

    Returns
    -------
    plt.Figure
        The matplotlib Figure object containing the ECDF plot which can be edited further.
    """
    if ecdf_kwargs is None:
        ecdf_kwargs = {}

    if not datasets:
        raise ValueError("At least one dataset is required to plot an ECDF.")

    # make sure 0 < ci < 1
    if not 0 < ci < 1:
        raise ValueError("ci must be between 0 and 1.")

    if not isinstance(figsize, tuple):
        figsize = (figsize, figsize)
    fig, axs = plt.subplots(figsize=figsize)

    # make the default ecdf_kwargs for the plot
    default_kwargs = {
        "ax": axs,
        "linewidth": 2,
    }
    default_kwargs.update(ecdf_kwargs)

    if colors is None:
        # get the default colors
        colors = sns.color_palette(None, n_colors=len(datasets))

    # Iterate over the dictionary to plot ECDFs
    for i, (label, data) in enumerate(datasets.items()):
        # Pick a color for the dataset if specified
        color = colors[i]
        default_kwargs["color"] = color

        # we first need to sort the data so its in the same order if CIs are plotted
        data = np.sort(data)
        sns.ecdfplot(data, label=label, **default_kwargs)

        # estimate an error bounds via bootsrapping over the data if the number of bootstraps is > 0
        if nbootstraps > 0:
            boot_ecdfs = []
            for _ in range(nbootstraps):
                sample = np.random.choice(data, size=len(data), replace=True)
                # calculate the ECDF for this sample at each point in the true sample data
                # searchsorted returns an array of indices in sample where the values in data would fit in
                # by dividing by the length of the sample we get the fraction of sample points that are less than or
                # equal to each point in the data, which is the ECDF value at that point
                boot_ecdf = np.searchsorted(np.sort(sample), data, side="right") / len(sample)
                boot_ecdfs.append(boot_ecdf)

            # calculate the confidence interval based on the user input
            low_percentile = (1.0 - ci) / 2.0 * 100
            high_percentile = 100 - low_percentile
            lower = np.percentile(boot_ecdfs, low_percentile, axis=0)
            upper = np.percentile(boot_ecdfs, high_percentile, axis=0)
            # now plot a shaded region between the confidence intervals
            plt.fill_between(data, lower, upper, alpha=0.2, color=color)

    if title is not None:
        plt.title(title, fontsize=14)

    plt.xlabel(f"{xlabel} {quantity} ({units})", fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    plt.xlim(left=0)
    plt.legend()
    # add gridlines to help identify 1, 2 kcal/mol errors
    plt.grid(alpha=0.3)
    plt.tight_layout()
    if filename is not None:
        plt.savefig(filename, bbox_inches="tight", dpi=300)
    return fig




[docs]
def ecdf_plot_DDGs(
    femap: FEMap,
    sources: list[str] | None = None,
    labels: list[str] | None = None,
    title: str | None = "ECDF of Edgewise Absolute Errors",
    filename: str | None = None,
    observable_type: RELATIVE_ANALYSIS_TYPES = "ddg",
    temperature: Quantity = 298.15 * unit.kelvin,
    **kwargs,
) -> plt.Figure:
    """
    Plot ECDF of absolute errors for edgewise relative free energies from multiple sources within a single FEMap.

    Each computational source in the FEMap (i.e. each distinct ``source`` tag on
    the relative calculations) is plotted as a separate ECDF curve.

    Parameters
    ----------
    femap : FEMap
        A single FEMap containing one or more computational sources together
        with experimental absolute measurements.
    sources : list[str] | None, default None
        Computational source names to include in the plot.  If ``None``, all
        computational sources are used.
    labels : list[str] | None, default None
        Display labels for the legend, one per source.  If ``None``, the source
        names are used directly, this should be used with ``sources`` to ensure the correct label is applied to the corresponding source.
    title : str | None, default "ECDF of Edgewise Absolute Errors"
        Title for the plot. If ``None``, no title is set.
    filename : str | None, default None
        If provided, the plot will be saved to this filename.
    observable_type : {"ddg", "dpic50"}, default "ddg"
        The observable type to plot values in.  Defaults to ``ddg`` (kcal/mol).
        Use ``dpic50`` to report DpIC50 values.
    temperature : Quantity, default 298.15 * unit.kelvin
        Temperature used for the unit conversion, only used if observable_type is ``"dpic50"``.
    **kwargs
        Additional keyword arguments to pass to `ecdf_plot`.

    Returns
    -------
    plt.Figure
        The matplotlib Figure object containing the ECDF plot which can be edited further.

    Notes
    -----
    If any edges are missing an experimental value, they will be skipped in the absolute error calculation.

    Raises
    ------
    ValueError
        If any edges are missing a calculated DDG value, if the number of sources and labels do not match or if no
        computational results are in the graph.
    """
    rel_df = femap.get_relative_dataframe(observable_type=observable_type, temperature=temperature)
    plot_meta = _OBSERVABLE_METADATA[observable_type.lower()]
    value_col = plot_meta["value_col"]
    comp_mask = rel_df["computational"]
    all_comp_sources = rel_df.loc[comp_mask, "source"].unique().tolist()

    if not all_comp_sources:
        raise ValueError("The FEMap contains no computational edges.")

    if sources is None:
        sources = all_comp_sources
    if labels is None:
        labels = list(sources)

    if len(sources) != len(labels):
        raise ValueError(f"`sources` and `labels` must have the same length, got {len(sources)} and {len(labels)}.")

    # get the experimental data
    exp_df = rel_df[~comp_mask].set_index(["labelA", "labelB"])[[value_col]].rename(columns={value_col: "value_exp"})

    datasets = {}
    for source, label in zip(sources, labels):
        src_df = rel_df[comp_mask & (rel_df["source"] == source)].set_index(["labelA", "labelB"])[[value_col]]
        if src_df.empty:
            raise ValueError(f"No computational edges found for source '{source}'.")

        merged = src_df.join(exp_df, how="left")
        # skip edges without an experimental reference DDG
        merged = merged.dropna(subset=["value_exp"])
        datasets[label] = np.abs(merged[value_col] - merged["value_exp"]).values

    # finally check all datasets are the same length
    if len({len(v) for v in datasets.values()}) != 1:
        raise ValueError(
            "Inconsistent number of computational edges across sources, make sure all edges have a result for each source."
        )

    # allow callers to override quantity/units via kwargs
    quantity = kwargs.pop("quantity", plot_meta["ecdf_quantity"])
    units = kwargs.pop("units", plot_meta["units"])
    return ecdf_plot(
        datasets,
        title=title,
        filename=filename,
        xlabel="Edgewise",
        quantity=quantity,
        units=units,
        **kwargs,
    )




[docs]
def ecdf_plot_DGs(
    femap: FEMap,
    sources: list[str] | None = None,
    labels: list[str] | None = None,
    title: str | None = "ECDF of Nodewise Absolute Errors",
    filename: str | None = None,
    centralizing: bool = True,
    observable_type: ABSOLUTE_ANALYSIS_TYPES = "dg",
    temperature: Quantity = 298.15 * unit.kelvin,
    **kwargs,
) -> plt.Figure:
    """
    Plot ECDF of absolute errors for nodewise absolute free energies from multiple sources within a single FEMap.

    Each computational source in the FEMap is plotted as a separate ECDF curve.

    Parameters
    ----------
    femap : FEMap
        A single FEMap containing one or more computational absolute sources together with experimental absolute measurements.
    sources : list[str] | None, default None
        Computational source names to include in the plot.
        If ``None``, all computational sources are used.
    labels : list[str] | None, default None
        Display labels for the legend, one per source.  If ``None``, the source
        names are used directly, this should be used with ``sources`` to ensure the correct label is applied to the corresponding source.
    title : str | None, default "ECDF of Nodewise Absolute Errors"
        Title for the plot.
    filename : str | None, default None
        If provided, the plot will be saved to this filename.
    centralizing : bool, default True
        whether to center both calculated and experimental values around zero before calculating absolute errors.
    observable_type : {"dg", "pic50"}, default "dg"
        The observable type to plot values in.  Defaults to ``dg`` (kcal/mol).
        Use ``pic50`` to report pIC50 values.
    temperature : Quantity, default 298.15 * unit.kelvin
        Temperature used for the unit conversion, only used if observable_type is ``"pic50"``.
    **kwargs
        Additional keyword arguments to pass to `ecdf_plot`.

    Returns
    -------
    plt.Figure
        The matplotlib Figure object containing the ECDF plot which can be edited further.

    Raises
    ------
    ValueError
        If any ligands are missing a calculated DG value, if the number of sources and labels do not match or if no
        computational results are in the graph.
    """
    df = femap.get_absolute_dataframe(observable_type=observable_type, temperature=temperature)
    plot_meta = _OBSERVABLE_METADATA[observable_type.lower()]
    value_col = plot_meta["value_col"]
    comp_mask = df["computational"]
    all_comp_sources = df.loc[comp_mask, "source"].unique().tolist()

    if not all_comp_sources:
        raise ValueError(
            "The FEMap contains no computed absolute values. "
            "Call generate_absolute_values() first or add calculated absolute measurements directly."
        )

    if sources is None:
        sources = all_comp_sources
    if labels is None:
        labels = list(sources)

    if len(sources) != len(labels):
        raise ValueError(f"`sources` and `labels` must have the same length, got {len(sources)} and {len(labels)}.")

    exp_df = (
        df[~comp_mask]
        .drop_duplicates(subset=["label"])
        .set_index("label")[[value_col]]
        .rename(columns={value_col: "value_exp"})
    )

    datasets = {}
    for source, label in zip(sources, labels):
        src_df = df[comp_mask & (df["source"] == source)].set_index("label")[[value_col]]
        if src_df.empty:
            raise ValueError(f"No computed absolute values found for source '{source}'.")

        merged = src_df.join(exp_df, how="left")
        merged = merged.dropna(subset=["value_exp"])
        x = merged["value_exp"].values
        y = merged[value_col].values
        if centralizing:
            x = x - x.mean()
            y = y - y.mean()
        datasets[label] = np.abs(y - x)

    if len({len(v) for v in datasets.values()}) != 1:
        raise ValueError(
            "Inconsistent number of computational nodes across sources, make sure all nodes have a result for each source."
        )

    # allow callers to override quantity/units via kwargs
    quantity = kwargs.pop("quantity", plot_meta["ecdf_quantity"])
    units = kwargs.pop("units", plot_meta["units"])
    return ecdf_plot(
        datasets,
        title=title,
        xlabel="Nodewise",
        quantity=quantity,
        units=units,
        filename=filename,
        **kwargs,
    )




[docs]
def ecdf_plot_all_DDGs(
    femap: FEMap,
    sources: list[str] | None = None,
    labels: list[str] | None = None,
    title: str | None = "ECDF of Pairwise (all-to-all) Absolute Errors",
    filename: str | None = None,
    observable_type: RELATIVE_ANALYSIS_TYPES = "ddg",
    temperature: Quantity = 298.15 * unit.kelvin,
    **kwargs,
) -> plt.Figure:
    """
    Plot ECDF of absolute errors for all-to-all relative free energies calculated from absolute free energies in a graph.

    Parameters
    ----------
    femap : FEMap
        A single FEMap containing one or more computational absolute sources together with experimental absolute measurements.
    sources : list[str] | None, default None
        Computational source names to include.  If ``None``, all computational
        sources in the absolute dataframe are used.
    labels : list[str] | None, default None
        Display labels for the legend.  Defaults to the source names.
    title : str | None, default "ECDF of Pairwise (all-to-all) Absolute Errors"
        Title for the plot.
    filename : str | None, default None
        If provided, the plot will be saved to this filename.
    observable_type : {"ddg", "dpic50"}, default "ddg"
        The observable type to plot values in.  Defaults to ``ddg`` (kcal/mol).
        Use ``dpic50`` to report DpIC50 values.
    temperature : Quantity, default 298.15 * unit.kelvin
        Temperature used for the unit conversion, only used if observable_type is ``"dpic50"``.
    **kwargs
        Additional keyword arguments to pass to `ecdf_plot`.

    Returns
    -------
    plt.Figure
        The matplotlib Figure object containing the ECDF plot which can be edited further.

    Raises
    ------
    ValueError
        If the FEMap contains no computed absolute values, if a requested
        source cannot be found, or if ``sources`` and ``labels`` differ in
        length.

    Notes
    -----
    Ligands without experimental absolute values are excluded before computing
    pairwise combinations. Only unique (unordered) pairs are included, so N
    ligands contribute N*(N-1)/2 data points.
    """
    rel_df = femap.get_all_to_all_relative_dataframe(
        symmetrical=False, observable_type=observable_type, temperature=temperature
    )
    plot_meta = _OBSERVABLE_METADATA[observable_type.lower()]
    value_col = plot_meta["value_col"]
    comp_mask = rel_df["computational"]
    all_comp_sources = rel_df.loc[comp_mask, "source"].unique().tolist()

    if not all_comp_sources:
        raise ValueError(
            "The FEMap contains no computed absolute values which are need to obtain the all-to-all pairwise DDGs. "
            "Call generate_absolute_values() first or add calculated absolute measurements directly."
        )

    if sources is None:
        sources = all_comp_sources
    if labels is None:
        labels = list(sources)

    if len(sources) != len(labels):
        raise ValueError(f"`sources` and `labels` must have the same length, got {len(sources)} and {len(labels)}.")

    exp_df = rel_df[~comp_mask].set_index(["labelA", "labelB"])[[value_col]].rename(columns={value_col: "value_exp"})

    datasets = {}
    for source, label in zip(sources, labels):
        src_df = rel_df[comp_mask & (rel_df["source"] == source)].set_index(["labelA", "labelB"])[[value_col]]
        if src_df.empty:
            raise ValueError(f"No computational edges found for source '{source}'.")

        merged = src_df.join(exp_df, how="left")
        # skip edges without an experimental reference DDG
        merged = merged.dropna(subset=["value_exp"])
        datasets[label] = np.abs(merged[value_col] - merged["value_exp"]).values

    # finally check all datasets are the same length
    if len({len(v) for v in datasets.values()}) != 1:
        raise ValueError(
            "Inconsistent number of computational edges across sources, make sure all edges have a result for each source."
        )

    # allow callers to override quantity/units via kwargs
    quantity = kwargs.pop("quantity", plot_meta["ecdf_quantity"])
    units = kwargs.pop("units", plot_meta["units"])
    return ecdf_plot(
        datasets,
        title=title,
        xlabel="Pairwise",
        quantity=quantity,
        units=units,
        filename=filename,
        **kwargs,
    )




[docs]
def plot_cycle_closure(
    fe_map: FEMap,
    filename: str | None,
    max_cycle_length: int = 5,
    sources: list[str] | None = None,
    bin_width: float = 0.5,
) -> plt.Figure:
    """
    Plot a histogram of cycle closure errors, taking the ``cc_per_edge (kcal/mol)``
    which is the cycle closure divided by the square root of the cycle length.

    Parameters
    ----------
    fe_map : FEMap
        FEMap object containing the calculated edges.
    filename : str | None, default None
        If provided, the plot will be saved to this filename.
    max_cycle_length : int, default 5
        Only consider cycles up to this length. Defaults to 5.
    sources : list[str] | None, default None
        List of sources to plot. If None, all sources are plotted.
    bin_width : float, default 0.5
        Width of histogram bins in kcal/mol. Default: 0.5

    Returns
    -------
    plt.Figure
        The matplotlib Figure object containing the histogram which can be edited further.

    Raises
    ------
    ValueError
        If the FEMap contains no cycles, or if a requested
        source cannot be found.
    """
    df = fe_map.get_cycle_closure_dataframe(max_cycle_length=max_cycle_length)

    if df.empty:
        raise ValueError("The FEMap does not contain cycles.")

    if sources is not None:
        df = df[df["source"].isin(sources)]
        if df.empty:
            raise ValueError(f"No cycles found for sources {sources}.")

    unique_sources = df["source"].unique()

    fig, ax = plt.subplots(figsize=(5, 4))

    max_val = df["cc_per_edge (kcal/mol)"].max()
    bins = np.arange(0, max_val + bin_width, bin_width).tolist()

    for source in unique_sources:
        source_df = df[df["source"] == source]
        ax.hist(source_df["cc_per_edge (kcal/mol)"], bins=bins, alpha=0.6, label=source)

    ax.set_xlabel(r"Cycle closure per edge (kcal mol$^{-1}$)")
    ax.set_ylabel("Count")
    ax.set_title("Cycle closure distribution")
    ax.legend()
    fig.tight_layout()

    if filename is None:
        plt.show()
    else:
        fig.savefig(filename, bbox_inches="tight", dpi=300)

    return fig