Source code for summarize_results

# summarize_results.py
"""
Module for creating tabular summaries from ML results data.
"""

import warnings

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

from ml_grid.results_processing.core import get_clean_data


class ResultsSummarizer:
    """Provides methods to summarize and transform results data into concise DataFrames.

    Attributes:
        data: The DataFrame passed to the constructor, stored unchanged.
        clean_data: The value returned by ``get_clean_data(data)``; every
            summary is computed from this frame.
    """

    def __init__(self, data: pd.DataFrame) -> None:
        """Initializes the summarizer.

        Args:
            data (pd.DataFrame): Aggregated results DataFrame.

        Raises:
            ValueError: If the input data is not a non-empty pandas DataFrame.
        """
        if not isinstance(data, pd.DataFrame) or data.empty:
            raise ValueError("Input data must be a non-empty pandas DataFrame.")

        self.data = data
        # NOTE(review): get_clean_data is defined in
        # ml_grid.results_processing.core; presumably it filters out unusable
        # runs -- confirm against that module.
        self.clean_data = get_clean_data(data)

    def get_best_model_per_outcome(self, metric: str = "auc") -> pd.DataFrame:
        """Finds the best model for each outcome and expands the feature list.

        This method identifies the single best-performing model run for each
        outcome variable based on the specified metric. It then transforms the
        'decoded_features' list into a set of boolean columns, where each new
        column represents a feature and its value indicates whether that
        feature was used in the best model run.

        Rows whose metric value is missing are never candidates for "best".
        An outcome for which every run has a missing metric is therefore left
        out of the result instead of causing a lookup error.

        Args:
            metric (str, optional): The performance metric to use for
                determining the "best" model. Defaults to 'auc'.

        Returns:
            pd.DataFrame: A DataFrame containing the best model run for each
            outcome, with additional boolean columns for each feature, sorted
            by ``metric`` in descending order. If none of the best runs has a
            list in 'decoded_features', a warning is issued and the best runs
            are returned without feature columns (and without sorting).

        Raises:
            ValueError: If 'outcome_variable', ``metric`` or
                'decoded_features' is not a column of the cleaned data.
        """
        clean = self.clean_data

        if "outcome_variable" not in clean.columns:
            raise ValueError("Data must contain an 'outcome_variable' column.")
        if metric not in clean.columns:
            raise ValueError(f"Metric '{metric}' not found in the data.")
        if "decoded_features" not in clean.columns:
            raise ValueError(
                "Data must contain a 'decoded_features' column. "
                "Ensure ResultsAggregator was run with a feature names CSV."
            )

        # 1. Find the index of the maximum metric value for each outcome group.
        # Only rows with a usable metric take part: for a group whose metric is
        # entirely missing, idxmax yields NaN (older pandas) or raises (newer
        # pandas), and a NaN label would make the .loc lookup below fail.
        scored = clean[clean[metric].notna()]
        if scored.empty:
            warnings.warn(
                f"No non-missing values found for metric '{metric}'. "
                "Returning an empty summary.",
                stacklevel=2,
            )
            return scored.copy()

        # NOTE(review): idxmax returns index *labels* and .loc selects by
        # label, so this assumes the cleaned data has a unique index -- confirm
        # against the aggregator that builds it.
        best_indices = scored.groupby("outcome_variable")[metric].idxmax()
        best_models_df = clean.loc[best_indices].copy()

        # 2. Convert 'decoded_features' into boolean columns.
        # Handle rows where 'decoded_features' might be NaN or not a list.
        feature_lists = best_models_df["decoded_features"].apply(
            lambda x: x if isinstance(x, list) else []
        )

        if feature_lists.empty or feature_lists.apply(len).sum() == 0:
            warnings.warn(
                "No features found in 'decoded_features' for the best models. "
                "Returning summary without feature columns.",
                stacklevel=2,
            )
            return best_models_df

        # Use MultiLabelBinarizer to create the feature indicator matrix:
        # one column per distinct feature name, one row per best model run.
        mlb = MultiLabelBinarizer()
        feature_indicator_df = pd.DataFrame(
            mlb.fit_transform(feature_lists),
            columns=mlb.classes_,
            index=feature_lists.index,
        ).astype(bool)

        # 3. Combine the original best models data with the new feature columns.
        # Use rsuffix to handle cases where a feature name (e.g., 'age')
        # conflicts with a column name already present in the main dataframe.
        result_df = best_models_df.join(feature_indicator_df, rsuffix="_feature")

        # Drop original list-based columns for clarity and sort.
        result_df = result_df.drop(
            columns=["f_list", "decoded_features"], errors="ignore"
        )
        return result_df.sort_values(by=metric, ascending=False)