# plot_algorithms.py
"""
Algorithm comparison plotting module for ML results analysis.
Focuses on comparing algorithm performance with outcome variable stratification.
"""
import warnings
from typing import List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind

from ml_grid.results_processing.core import get_clean_data
# Maximum number of outcomes to display in stratified plots to avoid clutter.
MAX_OUTCOMES_FOR_STRATIFIED_PLOT = 20
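# The plotting methods below expect the results DataFrame to provide at least:
#   - 'method_name': the algorithm identifier used for grouping,
#   - 'outcome_variable': required only for outcome-stratified plots,
#   - numeric metric columns such as 'auc' or 'run_time'.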
class AlgorithmComparisonPlotter:
"""A class for creating algorithm comparison visualizations."""
def __init__(self, data: pd.DataFrame):
"""Initializes the AlgorithmComparisonPlotter.
Args:
data (pd.DataFrame): A DataFrame containing the experiment results.
"""
self.clean_data = get_clean_data(data)
# Set style
plt.style.use("default")
sns.set_palette("husl")
def plot_algorithm_boxplots(
self,
metric: str = "auc",
algorithms_to_plot: Optional[List[str]] = None,
stratify_by_outcome: bool = False,
outcomes_to_plot: Optional[List[str]] = None,
figsize: Tuple[int, int] = (12, 6),
) -> None:
"""Creates box plots comparing algorithm performance.
Args:
metric (str, optional): The performance metric to compare.
Defaults to 'auc'.
algorithms_to_plot (Optional[List[str]], optional): A list of
specific algorithms to include. If None, all are used.
Defaults to None.
stratify_by_outcome (bool, optional): If True, creates separate
plots for each outcome. Defaults to False.
outcomes_to_plot (Optional[List[str]], optional): A list of
specific outcomes to plot. If None, all are used.
Defaults to None.
figsize (Tuple[int, int], optional): The figure size.
Defaults to (12, 6).
Raises:
ValueError: If the specified metric is not found in the data.
"""
if metric not in self.clean_data.columns:
raise ValueError(f"Metric '{metric}' not found in data")
if not stratify_by_outcome:
self._plot_single_algorithm_boxplot(metric, algorithms_to_plot, figsize)
else:
self._plot_stratified_algorithm_boxplots(
metric, algorithms_to_plot, outcomes_to_plot, figsize
)
def _plot_single_algorithm_boxplot(
self,
metric: str,
algorithms_to_plot: Optional[List[str]],
figsize: Tuple[int, int],
) -> None:
"""Create single box plot for all outcomes combined."""
plot_data = self.clean_data.copy()
if algorithms_to_plot:
plot_data = plot_data[plot_data["method_name"].isin(algorithms_to_plot)]
plt.figure(figsize=figsize)
# Create box plot
sns.boxplot(
data=plot_data, x="method_name", y=metric, showfliers=True, whis=1.5
)
# Add mean markers
for i, algo in enumerate(plot_data["method_name"].unique()):
algo_data = plot_data[plot_data["method_name"] == algo][metric]
mean_val = algo_data.mean()
plt.scatter(
i, mean_val, color="red", s=100, marker="D", zorder=10, label="Mean" if i == 0 else ""
)
plt.xticks(rotation=45, ha="right")
plt.xlabel("Algorithm", fontsize=12)
plt.ylabel(metric.upper(), fontsize=12)
plt.title(
f"{metric.upper()} Performance by Algorithm - All Outcomes",
fontsize=14,
fontweight="bold",
)
plt.grid(True, alpha=0.3)
plt.legend()
# Add sample size annotations
for i, algo in enumerate(plot_data["method_name"].unique()):
n_samples = len(plot_data[plot_data["method_name"] == algo])
plt.text(
i,
plt.ylim()[0],
f"n={n_samples}",
ha="center",
va="top",
fontsize=9,
fontweight="bold",
)
plt.tight_layout()
plt.show()
def _plot_stratified_algorithm_boxplots(
self,
metric: str,
algorithms_to_plot: Optional[List[str]],
outcomes_to_plot: Optional[List[str]],
figsize: Tuple[int, int],
) -> None:
"""Create stratified box plots by outcome variable."""
if "outcome_variable" not in self.clean_data.columns:
raise ValueError("outcome_variable column not found for stratification")
outcomes = outcomes_to_plot or sorted(
self.clean_data["outcome_variable"].unique()
)
if len(outcomes) > MAX_OUTCOMES_FOR_STRATIFIED_PLOT:
warnings.warn(
f"Found {len(outcomes)} outcomes, which is more than the display limit of {MAX_OUTCOMES_FOR_STRATIFIED_PLOT}. "
f"Displaying the first {MAX_OUTCOMES_FOR_STRATIFIED_PLOT}. "
"Use the 'outcomes_to_plot' parameter to select specific outcomes.",
stacklevel=2
)
outcomes = outcomes[:MAX_OUTCOMES_FOR_STRATIFIED_PLOT]
n_outcomes = len(outcomes)
# Calculate subplot layout
cols = min(3, n_outcomes)
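        # Ceiling division: the number of rows needed to fit all outcomes.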
rows = (n_outcomes + cols - 1) // cols
        fig, axes = plt.subplots(
            rows, cols, figsize=(cols * 5, rows * 4), squeeze=False
        )
        axes = axes.flatten()
for i, outcome in enumerate(outcomes):
ax = axes[i]
outcome_data = self.clean_data[
self.clean_data["outcome_variable"] == outcome
]
if algorithms_to_plot:
outcome_data = outcome_data[
outcome_data["method_name"].isin(algorithms_to_plot)
]
if len(outcome_data) > 0:
sns.boxplot(data=outcome_data, x="method_name", y=metric, ax=ax)
# Add means
for j, algo in enumerate(outcome_data["method_name"].unique()):
algo_data = outcome_data[outcome_data["method_name"] == algo][
metric
]
if len(algo_data) > 0:
mean_val = algo_data.mean()
ax.scatter(
j, mean_val, color="red", s=60, marker="D", zorder=10
)
ax.tick_params(axis="x", rotation=45)
plt.setp(ax.get_xticklabels(), ha="right")
ax.set_title(
f"{outcome}\n{metric.upper()}", fontsize=11, fontweight="bold"
)
ax.set_xlabel("Algorithm" if i >= len(outcomes) - cols else "")
ax.set_ylabel(metric.upper() if i % cols == 0 else "")
ax.grid(True, alpha=0.3)
else:
ax.text(
0.5,
0.5,
"No Data",
transform=ax.transAxes,
ha="center",
va="center",
fontsize=12,
)
ax.set_title(f"{outcome}", fontsize=11)
# Hide extra subplots
for j in range(i + 1, len(axes)):
axes[j].set_visible(False)
plt.suptitle(
f"{metric.upper()} Performance by Algorithm and Outcome",
fontsize=16,
fontweight="bold",
)
plt.tight_layout()
plt.show()
def plot_algorithm_ranking(
self,
metric: str = "auc",
algorithms_to_plot: Optional[List[str]] = None,
stratify_by_outcome: bool = False,
outcomes_to_plot: Optional[List[str]] = None,
top_n: int = 10,
figsize: Tuple[int, int] = (10, 8),
) -> None:
"""Plots a ranked bar chart of algorithm performance.
Args:
metric (str, optional): The performance metric to rank by.
Defaults to 'auc'.
algorithms_to_plot (Optional[List[str]], optional): A list of
specific algorithms to include. Defaults to None.
stratify_by_outcome (bool, optional): If True, creates separate
plots for each outcome. Defaults to False.
outcomes_to_plot (Optional[List[str]], optional): A list of
specific outcomes to plot when stratified. Defaults to None.
top_n (int, optional): The number of top algorithms to display.
Defaults to 10.
figsize (Tuple[int, int], optional): The figure size.
Defaults to (10, 8).
Raises:
ValueError: If the specified metric is not found, or if stratifying
and 'outcome_variable' column is missing.
"""
if metric not in self.clean_data.columns:
raise ValueError(f"Metric '{metric}' not found in data")
if not stratify_by_outcome:
self._plot_single_ranking(metric, algorithms_to_plot, top_n, figsize)
else:
if "outcome_variable" not in self.clean_data.columns:
raise ValueError("outcome_variable column not found for stratification")
self._plot_stratified_ranking(
metric, algorithms_to_plot, outcomes_to_plot, top_n, figsize
)
def _plot_single_ranking(
self,
metric: str,
algorithms_to_plot: Optional[List[str]],
top_n: int,
figsize: Tuple[int, int],
) -> None:
"""Plot a single ranked bar chart for all outcomes combined."""
plot_data = self.clean_data.copy()
if algorithms_to_plot:
plot_data = plot_data[plot_data["method_name"].isin(algorithms_to_plot)]
# Calculate mean performance for each algorithm
ranking = (
plot_data.groupby("method_name")[metric].mean().sort_values(ascending=False)
)
# Select top N
ranking = ranking.head(top_n)
plt.figure(figsize=figsize)
ax = sns.barplot(
x=ranking.values,
y=ranking.index,
hue=ranking.index,
orient="h",
palette="viridis",
legend=False,
)
ax.set_title(
f"Top {top_n} Algorithms by Mean {metric.upper()} - All Outcomes",
fontsize=14,
fontweight="bold",
)
ax.set_xlabel(f"Mean {metric.upper()}", fontsize=12)
ax.set_ylabel("Algorithm", fontsize=12)
# Add value labels to bars
for container in ax.containers:
ax.bar_label(container, fmt="%.3f", padding=3)
plt.tight_layout()
plt.show()
def _plot_stratified_ranking(
self,
metric: str,
algorithms_to_plot: Optional[List[str]],
outcomes_to_plot: Optional[List[str]],
top_n: int,
figsize: Tuple[int, int],
) -> None:
"""Plot stratified ranking bar charts by outcome."""
outcomes = outcomes_to_plot or sorted(
self.clean_data["outcome_variable"].unique()
)
if len(outcomes) > MAX_OUTCOMES_FOR_STRATIFIED_PLOT:
warnings.warn(
f"Found {len(outcomes)} outcomes, which is more than the display limit of {MAX_OUTCOMES_FOR_STRATIFIED_PLOT}. "
f"Displaying the first {MAX_OUTCOMES_FOR_STRATIFIED_PLOT}. "
"Use the 'outcomes_to_plot' parameter to select specific outcomes.",
stacklevel=2,
)
outcomes = outcomes[:MAX_OUTCOMES_FOR_STRATIFIED_PLOT]
n_outcomes = len(outcomes)
cols = min(2, n_outcomes)
rows = (n_outcomes + cols - 1) // cols
fig, axes = plt.subplots(
rows, cols, figsize=(cols * 7, rows * 5), squeeze=False
)
axes = axes.flatten()
for i, outcome in enumerate(outcomes):
ax = axes[i]
outcome_data = self.clean_data[
self.clean_data["outcome_variable"] == outcome
]
if algorithms_to_plot:
outcome_data = outcome_data[
outcome_data["method_name"].isin(algorithms_to_plot)
]
if len(outcome_data) > 0:
ranking = (
outcome_data.groupby("method_name")[metric]
.mean()
.sort_values(ascending=False)
.head(top_n)
)
if not ranking.empty:
sns.barplot(
x=ranking.values,
y=ranking.index,
hue=ranking.index,
orient="h",
ax=ax,
palette="plasma",
legend=False,
)
ax.set_title(
f"{outcome} - Top {min(top_n, len(ranking))} Algorithms",
fontsize=11,
fontweight="bold",
)
ax.set_xlabel(f"Mean {metric.upper()}")
ax.set_ylabel("")
# Add value labels
for container in ax.containers:
ax.bar_label(container, fmt="%.3f", padding=3, fontsize=9)
else:
ax.text(
0.5,
0.5,
"No Data",
ha="center",
va="center",
transform=ax.transAxes,
)
ax.set_title(f"{outcome}", fontsize=11)
else:
ax.text(
0.5, 0.5, "No Data", ha="center", va="center", transform=ax.transAxes
)
ax.set_title(f"{outcome}", fontsize=11)
# Hide extra subplots
for j in range(i + 1, len(axes)):
axes[j].set_visible(False)
plt.suptitle(
f"Top {top_n} Algorithms by Mean {metric.upper()} per Outcome",
fontsize=16,
fontweight="bold",
)
plt.tight_layout()
plt.show()
def plot_algorithm_stability(
self, metric: str = "auc", top_n: int = 15, figsize: Tuple[int, int] = (10, 8)
) -> None:
"""Plots the stability (standard deviation) of algorithm performance.
A lower standard deviation indicates more stable and predictable performance
across different runs and data subsets.
Args:
metric (str, optional): The performance metric to evaluate
stability on. Defaults to 'auc'.
top_n (int, optional): The number of algorithms to display, ranked
by stability (lower is better). Defaults to 15.
figsize (Tuple[int, int], optional): The figure size for the plot.
Defaults to (10, 8).
Raises:
ValueError: If the specified metric is not found in the data.
"""
if metric not in self.clean_data.columns:
raise ValueError(f"Metric '{metric}' not found in data")
# Calculate standard deviation for each algorithm
stability = (
self.clean_data.groupby("method_name")[metric]
.std()
.sort_values(ascending=True)
)
# Select top N most stable
stability = stability.head(top_n)
plt.figure(figsize=figsize)
ax = sns.barplot(
x=stability.values,
y=stability.index,
hue=stability.index,
orient="h",
palette="coolwarm_r",
legend=False,
)
ax.set_title(
f"Top {top_n} Most Stable Algorithms by {metric.upper()}",
fontsize=14,
fontweight="bold",
)
ax.set_xlabel(
f"Standard Deviation of {metric.upper()} (Lower is Better)", fontsize=12
)
ax.set_ylabel("Algorithm", fontsize=12)
        # Label every bar container (seaborn creates one container per hue level).
        for container in ax.containers:
            ax.bar_label(container, fmt="%.4f", padding=3)
plt.tight_layout()
plt.show()
def plot_pareto_front(
self,
metric_y: str = "auc",
metric_x: str = "run_time",
lower_is_better_x: bool = True,
figsize: Tuple[int, int] = (12, 8),
) -> None:
"""Plots a Pareto front for two competing metrics.
The Pareto front highlights the set of "optimal" algorithms where you cannot
improve one metric without degrading the other.
Args:
metric_y (str, optional): The primary performance metric (higher is
better). Defaults to 'auc'.
metric_x (str, optional): The secondary metric, often a cost
(e.g., 'run_time'). Defaults to 'run_time'.
lower_is_better_x (bool, optional): Set to True if a lower value of
`metric_x` is better. Defaults to True.
figsize (Tuple[int, int], optional): The figure size for the plot.
Defaults to (12, 8).
"""
# 1. Get mean performance for each algorithm
summary_df = (
self.clean_data.groupby("method_name")
.agg(mean_y=(metric_y, "mean"), mean_x=(metric_x, "mean"))
.reset_index()
)
# 2. Identify the Pareto front
# A point is on the Pareto front if no other point dominates it.
is_pareto = []
for i, row in summary_df.iterrows():
# Check if any other point dominates this one
# Dominates = better on y AND better on x
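            # Strict inequalities are used, so points that tie on both metrics
            # do not dominate each other and can both remain on the front.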
y_is_better = summary_df["mean_y"] > row["mean_y"]
if lower_is_better_x:
x_is_better = summary_df["mean_x"] < row["mean_x"]
else:
x_is_better = summary_df["mean_x"] > row["mean_x"]
is_dominated = (y_is_better & x_is_better).any()
is_pareto.append(not is_dominated)
summary_df["is_pareto"] = is_pareto
pareto_df = summary_df[summary_df["is_pareto"]].sort_values("mean_x")
# 3. Plot
plt.figure(figsize=figsize)
sns.scatterplot(
data=summary_df,
x="mean_x",
y="mean_y",
hue="is_pareto",
style="is_pareto",
s=100,
palette={True: "red", False: "grey"},
legend=False,
)
if not pareto_df.empty:
plt.plot(pareto_df["mean_x"], pareto_df["mean_y"], "r--", alpha=0.7)
# Annotate points
for i, row in summary_df.iterrows():
plt.text(
row["mean_x"],
row["mean_y"] * 1.001,
row["method_name"],
fontsize=9,
ha="left",
va="bottom",
)
plt.title(
f"Pareto Front: {metric_y.upper()} vs {metric_x.title()}",
fontsize=14,
fontweight="bold",
)
plt.xlabel(
f'Mean {metric_x.title()}{" (Lower is Better)" if lower_is_better_x else ""}',
fontsize=12,
)
plt.ylabel(f"Mean {metric_y.upper()} (Higher is Better)", fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
def plot_statistical_significance_heatmap(
self,
metric: str = "auc",
outcome: Optional[str] = None,
figsize: Tuple[int, int] = (14, 12),
) -> None:
"""Performs pairwise t-tests and visualizes p-values in a heatmap.
This helps determine if observed performance differences between
algorithms are statistically significant.
Args:
metric (str, optional): The performance metric to compare.
Defaults to 'auc'.
outcome (Optional[str], optional): If specified, filters data for a
single outcome. Otherwise, uses all data. Defaults to None.
figsize (Tuple[int, int], optional): The figure size for the plot.
Defaults to (14, 12).
Raises:
ValueError: If stratifying and 'outcome_variable' column is missing.
"""
plot_data = self.clean_data.copy()
title = f"Pairwise T-test P-values for {metric.upper()}"
if outcome:
if "outcome_variable" not in plot_data.columns:
raise ValueError("outcome_variable column not found for stratified analysis.")
plot_data = plot_data[plot_data["outcome_variable"] == outcome]
title += f" (Outcome: {outcome})"
algorithms = sorted(plot_data["method_name"].unique())
p_values = pd.DataFrame(np.nan, index=algorithms, columns=algorithms)
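        # Only the lower triangle is computed and then mirrored; the diagonal
        # stays NaN because an algorithm is not compared against itself.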
for i, algo1 in enumerate(algorithms):
for j, algo2 in enumerate(algorithms):
if i <= j:
continue
data1 = plot_data[plot_data["method_name"] == algo1][metric].dropna()
data2 = plot_data[plot_data["method_name"] == algo2][metric].dropna()
if len(data1) > 1 and len(data2) > 1:
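                    # Welch's t-test (equal_var=False) does not assume equal variances.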
_, p_val = ttest_ind(data1, data2, equal_var=False, nan_policy="omit")
p_values.loc[algo1, algo2] = p_val
p_values.loc[algo2, algo1] = p_val
plt.figure(figsize=figsize)
sns.heatmap(
p_values,
annot=True,
fmt=".3f",
cmap="coolwarm_r",
center=0.05,
cbar_kws={"label": "P-value"},
        )
        plt.title(title, fontsize=14, fontweight="bold")
        plt.tight_layout()
plt.show()
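# The block below is a minimal usage sketch, not part of the library API. It
# assumes a results file ("results.csv", a hypothetical path) whose columns
# include 'method_name', 'outcome_variable', and numeric metrics such as 'auc'
# and 'run_time', matching the columns referenced by the methods above.
if __name__ == "__main__":
    results = pd.read_csv("results.csv")  # hypothetical path; adjust as needed
    plotter = AlgorithmComparisonPlotter(results)

    # Overall and outcome-stratified box plots for AUC.
    plotter.plot_algorithm_boxplots(metric="auc")
    plotter.plot_algorithm_boxplots(metric="auc", stratify_by_outcome=True)

    # Ranking, stability, trade-off, and significance views.
    plotter.plot_algorithm_ranking(metric="auc", top_n=10)
    plotter.plot_algorithm_stability(metric="auc")
    plotter.plot_pareto_front(metric_y="auc", metric_x="run_time")
    plotter.plot_statistical_significance_heatmap(metric="auc")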