Source code for ml_grid.util.grid_param_space_ga

"""Defines the Grid class for creating a hyperparameter search space for GA."""

import itertools as it
import random
from typing import Dict, Generator, Iterator, List, Optional, Union

from ml_grid.util.global_params import global_parameters


class Grid:
    """Generates and manages a grid of hyperparameter settings for GA experiments.

    On construction the default search space stored in ``grid`` is expanded
    into every possible combination, and a random subset of at most
    ``sample_n`` combinations is kept in ``settings_list``.
    """

    global_params: global_parameters
    """A reference to the global parameters singleton instance."""

    verbose: int
    """The verbosity level, inherited from global parameters."""

    sample_n: int
    """The number of random settings to sample from the full grid."""

    grid: Dict[str, Union[List, Dict]]
    """
    The dictionary defining the hyperparameter search space for the GA.
    Keys represent different aspects of the experiment:

    - **weighted**: The weighting strategy for the ensemble.
    - **use_stored_base_learners**: Whether to use pre-trained base learners.
    - **store_base_learners**: Whether to save the base learners after training.
    - **resample**: The resampling strategy to handle class imbalance.
    - **scale**: Whether to apply standard scaling to features.
    - **n_features**: The number of features to use (currently 'all').
    - **param_space_size**: The size of the hyperparameter space for base learners.
    - **n_unique_out**: A parameter for future use.
    - **outcome_var_n**: The index of the outcome variable to use.
    - **div_p**: A parameter for future use.
    - **percent_missing**: The threshold for dropping columns with missing values.
    - **corr**: The threshold for dropping highly correlated features.
    - **cxpb**: The crossover probability for the genetic algorithm.
    - **mutpb**: The mutation probability for the genetic algorithm.
    - **indpb**: The independent probability for each attribute to be mutated.
    - **t_size**: The tournament size for selection in the genetic algorithm.
    - **data**: A nested dictionary specifying which feature categories to include.
    """

    settings_list: List[Dict]
    """
    A list of hyperparameter combinations sampled from the `grid`.
    Each element is a dictionary representing one complete experimental
    configuration.
    """

    settings_list_iterator: Iterator[Dict]
    """An iterator over the `settings_list`."""

    def __init__(self, sample_n: Optional[int] = 1000):
        """Initializes the Grid object for Genetic Algorithms.

        This class creates a comprehensive grid of settings by taking the
        Cartesian product of all specified hyperparameters. It then randomly
        samples a specified number of these settings to create a manageable
        list for experimentation.

        Args:
            sample_n (Optional[int], optional): The number of random settings
                to sample from the full grid. ``None`` is treated as 1000.
                Values larger than the full grid are clamped to the grid
                size. Defaults to 1000.

        Raises:
            ValueError: If ``sample_n`` is negative.
        """
        resolved_sample_n = 1000 if sample_n is None else sample_n
        # Fail fast: random.sample would raise the same exception type, but
        # only after the whole Cartesian product has been materialised.
        if resolved_sample_n < 0:
            raise ValueError(
                f"sample_n must be a non-negative integer, got {resolved_sample_n}"
            )

        self.global_params = global_parameters
        self.verbose = self.global_params.verbose
        self.sample_n = resolved_sample_n

        if self.verbose >= 1:
            print(f"Feature space slice sample_n {self.sample_n}")

        # Default grid.
        # NOTE: settings_list is sampled from this dictionary further down in
        # this constructor, so editing `grid` on an existing object does not
        # regenerate `settings_list` or `settings_list_iterator`.
        self.grid = {
            "weighted": ["ann", "de", "unweighted"],
            "use_stored_base_learners": [False],
            "store_base_learners": [False],
            "resample": ["undersample", None],
            "scale": [True],
            "n_features": ["all"],
            "param_space_size": ["medium"],
            "n_unique_out": [10],
            "outcome_var_n": ["1"],
            "div_p": [0],
            "percent_missing": [99.9, 95, 90],  # n/100 ex 95 for 95%
            "corr": [0.8, 0.5],
            "cxpb": [0.5, 0.75, 0.25],
            "mutpb": [0.2, 0.4, 0.8],
            "indpb": [0.025, 0.05, 0.075],
            "t_size": [3, 6, 9],
            "data": [
                {
                    "age": [True],
                    "sex": [True],
                    "bmi": [True],
                    "ethnicity": [True],
                    "bloods": [True, False],
                    "diagnostic_order": [True, False],
                    "drug_order": [True, False],
                    "annotation_n": [True, False],
                    "meta_sp_annotation_n": [True, False],
                    "annotation_mrc_n": [True, False],
                    "meta_sp_annotation_mrc_n": [True, False],
                    "core_02": [False],
                    "bed": [False],
                    "vte_status": [True],
                    "hosp_site": [True],
                    "core_resus": [False],
                    "news": [False],
                }
            ],
        }

        self.settings_list = list(self._c_prod(self.grid))
        full_settings_size = len(self.settings_list)
        print(f"Full settings_list size: {full_settings_size}")

        # random.sample already selects at random; the shuffle is kept so that
        # seeded runs consume the random stream exactly as they did before.
        random.shuffle(self.settings_list)

        # Ensure sample_n is not greater than the number of available settings
        sample_size = min(self.sample_n, full_settings_size)
        if self.sample_n > full_settings_size and self.verbose >= 1:
            print(
                f"Warning: sample_n ({self.sample_n}) is larger than the number of settings ({full_settings_size}). Using all settings."
            )

        self.settings_list = random.sample(self.settings_list, sample_size)
        self.settings_list_iterator = iter(self.settings_list)

    @staticmethod
    def _c_prod(d: Union[Dict, List]) -> Generator[Dict, None, None]:
        """Recursively generates the Cartesian product of a nested dictionary.

        A list is treated as a set of alternatives: scalar items are yielded
        as they are, while nested lists and dictionaries are expanded
        recursively. A dictionary yields one dictionary per combination of
        the alternatives produced for each of its values.

        Args:
            d (Union[Dict, List]): The dictionary or list of settings.

        Yields:
            Generator[Dict, None, None]: A generator of dictionaries, each
            representing a unique combination of settings.
        """
        if isinstance(d, list):
            for item in d:
                if isinstance(item, (dict, list)):
                    yield from Grid._c_prod(item)
                else:
                    yield item
        else:
            keys = list(d.keys())
            for combo in it.product(*map(Grid._c_prod, d.values())):
                yield dict(zip(keys, combo))