# plot_algorithms.py
"""
Algorithm comparison plotting module for ML results analysis.
Focuses on comparing algorithm performance with outcome variable stratification.
"""
import warnings
from typing import List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind

from ml_grid.results_processing.core import get_clean_data
# Maximum number of outcomes to display in stratified plots to avoid clutter.
MAX_OUTCOMES_FOR_STRATIFIED_PLOT = 20
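# The plotting methods below expect the results DataFrame to provide at least:
#   - 'method_name': the algorithm identifier used for grouping,
#   - 'outcome_variable': required only for outcome-stratified plots,
#   - numeric metric columns such as 'auc' or 'run_time'.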
class AlgorithmComparisonPlotter:
"""A class for creating algorithm comparison visualizations."""
def __init__(self, data: pd.DataFrame):
"""Initializes the AlgorithmComparisonPlotter.
Args:
data (pd.DataFrame): A DataFrame containing the experiment results.
"""
self.clean_data = get_clean_data(data)
# Set style
plt.style.use("default")
sns.set_palette("husl")
def plot_algorithm_boxplots(
self,
metric: str = "auc",
algorithms_to_plot: Optional[List[str]] = None,
stratify_by_outcome: bool = False,
outcomes_to_plot: Optional[List[str]] = None,
figsize: Tuple[int, int] = (12, 6),
) -> None:
"""Creates box plots comparing algorithm performance.
Args:
metric (str, optional): The performance metric to compare.
Defaults to 'auc'.
algorithms_to_plot (Optional[List[str]], optional): A list of
specific algorithms to include. If None, all are used.
Defaults to None.
stratify_by_outcome (bool, optional): If True, creates separate
plots for each outcome. Defaults to False.
outcomes_to_plot (Optional[List[str]], optional): A list of
specific outcomes to plot. If None, all are used.
Defaults to None.
figsize (Tuple[int, int], optional): The figure size.
Defaults to (12, 6).
Raises:
ValueError: If the specified metric is not found in the data.
"""
if metric not in self.clean_data.columns:
raise ValueError(f"Metric '{metric}' not found in data")
if not stratify_by_outcome:
self._plot_single_algorithm_boxplot(metric, algorithms_to_plot, figsize)
else:
self._plot_stratified_algorithm_boxplots(
metric, algorithms_to_plot, outcomes_to_plot, figsize
)
def _plot_single_algorithm_boxplot(
self,
metric: str,
algorithms_to_plot: Optional[List[str]],
figsize: Tuple[int, int],
) -> None:
"""Create single box plot for all outcomes combined."""
plot_data = self.clean_data.copy()
if algorithms_to_plot:
plot_data = plot_data[plot_data["method_name"].isin(algorithms_to_plot)]
plt.figure(figsize=figsize)
# Create box plot
sns.boxplot(
data=plot_data, x="method_name", y=metric, showfliers=True, whis=1.5
)
# Add mean markers
for i, algo in enumerate(plot_data["method_name"].unique()):
algo_data = plot_data[plot_data["method_name"] == algo][metric]
mean_val = algo_data.mean()
plt.scatter(
i, mean_val, color="red", s=100, marker="D", zorder=10, label="Mean" if i == 0 else ""
)
plt.xticks(rotation=45, ha="right")
plt.xlabel("Algorithm", fontsize=12)
plt.ylabel(metric.upper(), fontsize=12)
plt.title(
f"{metric.upper()} Performance by Algorithm - All Outcomes",
fontsize=14,
fontweight="bold",
)
plt.grid(True, alpha=0.3)
plt.legend()
# Add sample size annotations
for i, algo in enumerate(plot_data["method_name"].unique()):
n_samples = len(plot_data[plot_data["method_name"] == algo])
plt.text(
i,
plt.ylim()[0],
f"n={n_samples}",
ha="center",
va="top",
fontsize=9,
fontweight="bold",
)
plt.tight_layout()
plt.show()
def _plot_stratified_algorithm_boxplots(
self,
metric: str,
algorithms_to_plot: Optional[List[str]],
outcomes_to_plot: Optional[List[str]],
figsize: Tuple[int, int],
) -> None:
"""Create stratified box plots by outcome variable."""
if "outcome_variable" not in self.clean_data.columns:
raise ValueError("outcome_variable column not found for stratification")
outcomes = outcomes_to_plot or sorted(
self.clean_data["outcome_variable"].unique()
)
if len(outcomes) > MAX_OUTCOMES_FOR_STRATIFIED_PLOT:
warnings.warn(
f"Found {len(outcomes)} outcomes, which is more than the display limit of {MAX_OUTCOMES_FOR_STRATIFIED_PLOT}. "
f"Displaying the first {MAX_OUTCOMES_FOR_STRATIFIED_PLOT}. "
"Use the 'outcomes_to_plot' parameter to select specific outcomes.",
stacklevel=2
)
outcomes = outcomes[:MAX_OUTCOMES_FOR_STRATIFIED_PLOT]
n_outcomes = len(outcomes)
# Calculate subplot layout
cols = min(3, n_outcomes)
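        # Ceiling division: the number of rows needed to fit all outcomes.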
rows = (n_outcomes + cols - 1) // cols
        fig, axes = plt.subplots(
            rows, cols, figsize=(cols * 5, rows * 4), squeeze=False
        )
        axes = axes.flatten()
for i, outcome in enumerate(outcomes):
ax = axes[i]
outcome_data = self.clean_data[
self.clean_data["outcome_variable"] == outcome
]
if algorithms_to_plot:
outcome_data = outcome_data[
outcome_data["method_name"].isin(algorithms_to_plot)
]
if len(outcome_data) > 0:
sns.boxplot(data=outcome_data, x="method_name", y=metric, ax=ax)
# Add means
for j, algo in enumerate(outcome_data["method_name"].unique()):
algo_data = outcome_data[outcome_data["method_name"] == algo][
metric
]
if len(algo_data) > 0:
mean_val = algo_data.mean()
ax.scatter(
j, mean_val, color="red", s=60, marker="D", zorder=10
)
ax.tick_params(axis="x", rotation=45)
plt.setp(ax.get_xticklabels(), ha="right")
ax.set_title(
f"{outcome}\n{metric.upper()}", fontsize=11, fontweight="bold"
)
ax.set_xlabel("Algorithm" if i >= len(outcomes) - cols else "")
ax.set_ylabel(metric.upper() if i % cols == 0 else "")
ax.grid(True, alpha=0.3)
else:
ax.text(
0.5,
0.5,
"No Data",
transform=ax.transAxes,
ha="center",
va="center",
fontsize=12,
)
ax.set_title(f"{outcome}", fontsize=11)
# Hide extra subplots
for j in range(i + 1, len(axes)):
axes[j].set_visible(False)
plt.suptitle(
f"{metric.upper()} Performance by Algorithm and Outcome",
fontsize=16,
fontweight="bold",
)
plt.tight_layout()
plt.show()
def plot_algorithm_ranking(
self,
metric: str = "auc",
algorithms_to_plot: Optional[List[str]] = None,
stratify_by_outcome: bool = False,
outcomes_to_plot: Optional[List[str]] = None,
top_n: int = 10,
figsize: Tuple[int, int] = (10, 8),
) -> None:
"""Plots a ranked bar chart of algorithm performance.
Args:
metric (str, optional): The performance metric to rank by.
Defaults to 'auc'.
algorithms_to_plot (Optional[List[str]], optional): A list of
specific algorithms to include. Defaults to None.
stratify_by_outcome (bool, optional): If True, creates separate
plots for each outcome. Defaults to False.
outcomes_to_plot (Optional[List[str]], optional): A list of
specific outcomes to plot when stratified. Defaults to None.
top_n (int, optional): The number of top algorithms to display.
Defaults to 10.
figsize (Tuple[int, int], optional): The figure size.
Defaults to (10, 8).
Raises:
ValueError: If the specified metric is not found, or if stratifying
and 'outcome_variable' column is missing.
"""
if metric not in self.clean_data.columns:
raise ValueError(f"Metric '{metric}' not found in data")
if not stratify_by_outcome:
self._plot_single_ranking(metric, algorithms_to_plot, top_n, figsize)
else:
if "outcome_variable" not in self.clean_data.columns:
raise ValueError("outcome_variable column not found for stratification")
self._plot_stratified_ranking(
metric, algorithms_to_plot, outcomes_to_plot, top_n, figsize
)
def _plot_single_ranking(
self,
metric: str,
algorithms_to_plot: Optional[List[str]],
top_n: int,
figsize: Tuple[int, int],
) -> None:
"""Plot a single ranked bar chart for all outcomes combined."""
plot_data = self.clean_data.copy()
if algorithms_to_plot:
plot_data = plot_data[plot_data["method_name"].isin(algorithms_to_plot)]
# Calculate mean performance for each algorithm
ranking = (
plot_data.groupby("method_name")[metric].mean().sort_values(ascending=False)
)
# Select top N
ranking = ranking.head(top_n)
plt.figure(figsize=figsize)
ax = sns.barplot(
x=ranking.values,
y=ranking.index,
hue=ranking.index,
orient="h",
palette="viridis",
legend=False,
)
ax.set_title(
f"Top {top_n} Algorithms by Mean {metric.upper()} - All Outcomes",
fontsize=14,
fontweight="bold",
)
ax.set_xlabel(f"Mean {metric.upper()}", fontsize=12)
ax.set_ylabel("Algorithm", fontsize=12)
# Add value labels to bars
for container in ax.containers:
ax.bar_label(container, fmt="%.3f", padding=3)
plt.tight_layout()
plt.show()
def _plot_stratified_ranking(
self,
metric: str,
algorithms_to_plot: Optional[List[str]],
outcomes_to_plot: Optional[List[str]],
top_n: int,
figsize: Tuple[int, int],
) -> None:
"""Plot stratified ranking bar charts by outcome."""
outcomes = outcomes_to_plot or sorted(
self.clean_data["outcome_variable"].unique()
)
if len(outcomes) > MAX_OUTCOMES_FOR_STRATIFIED_PLOT:
warnings.warn(
f"Found {len(outcomes)} outcomes, which is more than the display limit of {MAX_OUTCOMES_FOR_STRATIFIED_PLOT}. "
f"Displaying the first {MAX_OUTCOMES_FOR_STRATIFIED_PLOT}. "
"Use the 'outcomes_to_plot' parameter to select specific outcomes.",
stacklevel=2,
)
outcomes = outcomes[:MAX_OUTCOMES_FOR_STRATIFIED_PLOT]
n_outcomes = len(outcomes)
cols = min(2, n_outcomes)
rows = (n_outcomes + cols - 1) // cols
fig, axes = plt.subplots(
rows, cols, figsize=(cols * 7, rows * 5), squeeze=False
)
axes = axes.flatten()
for i, outcome in enumerate(outcomes):
ax = axes[i]
outcome_data = self.clean_data[
self.clean_data["outcome_variable"] == outcome
]
if algorithms_to_plot:
outcome_data = outcome_data[
outcome_data["method_name"].isin(algorithms_to_plot)
]
if len(outcome_data) > 0:
ranking = (
outcome_data.groupby("method_name")[metric]
.mean()
.sort_values(ascending=False)
.head(top_n)
)
if not ranking.empty:
sns.barplot(
x=ranking.values,
y=ranking.index,
hue=ranking.index,
orient="h",
ax=ax,
palette="plasma",
legend=False,
)
ax.set_title(
f"{outcome} - Top {min(top_n, len(ranking))} Algorithms",
fontsize=11,
fontweight="bold",
)
ax.set_xlabel(f"Mean {metric.upper()}")
ax.set_ylabel("")
# Add value labels
for container in ax.containers:
ax.bar_label(container, fmt="%.3f", padding=3, fontsize=9)
else:
ax.text(
0.5,
0.5,
"No Data",
ha="center",
va="center",
transform=ax.transAxes,
)
ax.set_title(f"{outcome}", fontsize=11)
else:
ax.text(
0.5, 0.5, "No Data", ha="center", va="center", transform=ax.transAxes
)
ax.set_title(f"{outcome}", fontsize=11)
# Hide extra subplots
for j in range(i + 1, len(axes)):
axes[j].set_visible(False)
plt.suptitle(
f"Top {top_n} Algorithms by Mean {metric.upper()} per Outcome",
fontsize=16,
fontweight="bold",
)
plt.tight_layout()
plt.show()
def plot_algorithm_stability(
self, metric: str = "auc", top_n: int = 15, figsize: Tuple[int, int] = (10, 8)
) -> None:
"""Plots the stability (standard deviation) of algorithm performance.
A lower standard deviation indicates more stable and predictable performance
across different runs and data subsets.
Args:
metric (str, optional): The performance metric to evaluate
stability on. Defaults to 'auc'.
top_n (int, optional): The number of algorithms to display, ranked
by stability (lower is better). Defaults to 15.
figsize (Tuple[int, int], optional): The figure size for the plot.
Defaults to (10, 8).
Raises:
ValueError: If the specified metric is not found in the data.
"""
if metric not in self.clean_data.columns:
raise ValueError(f"Metric '{metric}' not found in data")
# Calculate standard deviation for each algorithm
stability = (
self.clean_data.groupby("method_name")[metric]
.std()
.sort_values(ascending=True)
)
# Select top N most stable
stability = stability.head(top_n)
plt.figure(figsize=figsize)
ax = sns.barplot(
x=stability.values,
y=stability.index,
hue=stability.index,
orient="h",
palette="coolwarm_r",
legend=False,
)
ax.set_title(
f"Top {top_n} Most Stable Algorithms by {metric.upper()}",
fontsize=14,
fontweight="bold",
)
ax.set_xlabel(
f"Standard Deviation of {metric.upper()} (Lower is Better)", fontsize=12
)
ax.set_ylabel("Algorithm", fontsize=12)
        # Label every bar container (seaborn creates one container per hue level).
        for container in ax.containers:
            ax.bar_label(container, fmt="%.4f", padding=3)
plt.tight_layout()
plt.show()
def plot_pareto_front(
self,
metric_y: str = "auc",
metric_x: str = "run_time",
lower_is_better_x: bool = True,
figsize: Tuple[int, int] = (12, 8),
) -> None:
"""Plots a Pareto front for two competing metrics.
The Pareto front highlights the set of "optimal" algorithms where you cannot
improve one metric without degrading the other.
Args:
metric_y (str, optional): The primary performance metric (higher is
better). Defaults to 'auc'.
metric_x (str, optional): The secondary metric, often a cost
(e.g., 'run_time'). Defaults to 'run_time'.
lower_is_better_x (bool, optional): Set to True if a lower value of
`metric_x` is better. Defaults to True.
figsize (Tuple[int, int], optional): The figure size for the plot.
Defaults to (12, 8).
"""
# 1. Get mean performance for each algorithm
summary_df = (
self.clean_data.groupby("method_name")
.agg(mean_y=(metric_y, "mean"), mean_x=(metric_x, "mean"))
.reset_index()
)
# 2. Identify the Pareto front
# A point is on the Pareto front if no other point dominates it.
is_pareto = []
for i, row in summary_df.iterrows():
# Check if any other point dominates this one
# Dominates = better on y AND better on x
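            # Strict inequalities are used, so points that tie on both metrics
            # do not dominate each other and can both remain on the front.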
y_is_better = summary_df["mean_y"] > row["mean_y"]
if lower_is_better_x:
x_is_better = summary_df["mean_x"] < row["mean_x"]
else:
x_is_better = summary_df["mean_x"] > row["mean_x"]
is_dominated = (y_is_better & x_is_better).any()
is_pareto.append(not is_dominated)
summary_df["is_pareto"] = is_pareto
pareto_df = summary_df[summary_df["is_pareto"]].sort_values("mean_x")
# 3. Plot
plt.figure(figsize=figsize)
sns.scatterplot(
data=summary_df,
x="mean_x",
y="mean_y",
hue="is_pareto",
style="is_pareto",
s=100,
palette={True: "red", False: "grey"},
legend=False,
)
if not pareto_df.empty:
plt.plot(pareto_df["mean_x"], pareto_df["mean_y"], "r--", alpha=0.7)
# Annotate points
for i, row in summary_df.iterrows():
plt.text(
row["mean_x"],
row["mean_y"] * 1.001,
row["method_name"],
fontsize=9,
ha="left",
va="bottom",
)
plt.title(
f"Pareto Front: {metric_y.upper()} vs {metric_x.title()}",
fontsize=14,
fontweight="bold",
)
plt.xlabel(
f'Mean {metric_x.title()}{" (Lower is Better)" if lower_is_better_x else ""}',
fontsize=12,
)
plt.ylabel(f"Mean {metric_y.upper()} (Higher is Better)", fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
def plot_statistical_significance_heatmap(
self,
metric: str = "auc",
outcome: Optional[str] = None,
figsize: Tuple[int, int] = (14, 12),
) -> None:
"""Performs pairwise t-tests and visualizes p-values in a heatmap.
This helps determine if observed performance differences between
algorithms are statistically significant.
Args:
metric (str, optional): The performance metric to compare.
Defaults to 'auc'.
outcome (Optional[str], optional): If specified, filters data for a
single outcome. Otherwise, uses all data. Defaults to None.
figsize (Tuple[int, int], optional): The figure size for the plot.
Defaults to (14, 12).
Raises:
ValueError: If stratifying and 'outcome_variable' column is missing.
"""
plot_data = self.clean_data.copy()
title = f"Pairwise T-test P-values for {metric.upper()}"
if outcome:
if "outcome_variable" not in plot_data.columns:
raise ValueError("outcome_variable column not found for stratified analysis.")
plot_data = plot_data[plot_data["outcome_variable"] == outcome]
title += f" (Outcome: {outcome})"
algorithms = sorted(plot_data["method_name"].unique())
p_values = pd.DataFrame(np.nan, index=algorithms, columns=algorithms)
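        # Only the lower triangle is computed and then mirrored; the diagonal
        # stays NaN because an algorithm is not compared against itself.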
for i, algo1 in enumerate(algorithms):
for j, algo2 in enumerate(algorithms):
if i <= j:
continue
data1 = plot_data[plot_data["method_name"] == algo1][metric].dropna()
data2 = plot_data[plot_data["method_name"] == algo2][metric].dropna()
if len(data1) > 1 and len(data2) > 1:
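                    # Welch's t-test (equal_var=False) does not assume equal variances.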
_, p_val = ttest_ind(data1, data2, equal_var=False, nan_policy="omit")
p_values.loc[algo1, algo2] = p_val
p_values.loc[algo2, algo1] = p_val
plt.figure(figsize=figsize)
sns.heatmap(
p_values,
annot=True,
fmt=".3f",
cmap="coolwarm_r",
center=0.05,
cbar_kws={"label": "P-value"},
        )
        plt.title(title, fontsize=14, fontweight="bold")
        plt.tight_layout()
plt.show()
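# The block below is a minimal usage sketch, not part of the library API. It
# assumes a results file ("results.csv", a hypothetical path) whose columns
# include 'method_name', 'outcome_variable', and numeric metrics such as 'auc'
# and 'run_time', matching the columns referenced by the methods above.
if __name__ == "__main__":
    results = pd.read_csv("results.csv")  # hypothetical path; adjust as needed
    plotter = AlgorithmComparisonPlotter(results)

    # Overall and outcome-stratified box plots for AUC.
    plotter.plot_algorithm_boxplots(metric="auc")
    plotter.plot_algorithm_boxplots(metric="auc", stratify_by_outcome=True)

    # Ranking, stability, trade-off, and significance views.
    plotter.plot_algorithm_ranking(metric="auc", top_n=10)
    plotter.plot_algorithm_stability(metric="auc")
    plotter.plot_pareto_front(metric_y="auc", metric_x="run_time")
    plotter.plot_statistical_significance_heatmap(metric="auc")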