Source code for ml_grid.pipeline.main

import traceback
from typing import Any, Dict, List, Tuple

import numpy as np
from catboost import CatBoostError
from ml_grid.pipeline import grid_search_cross_validate
from ml_grid.pipeline.data import pipe
from ml_grid.util.bayes_utils import calculate_combinations
from ml_grid.util.global_params import global_parameters
from sklearn.model_selection import ParameterGrid

class run:
    """Orchestrates the hyperparameter search for a list of models."""

    global_params: global_parameters
    """A reference to the global parameters singleton instance."""

    verbose: int
    """The verbosity level for logging, inherited from global parameters."""

    error_raise: bool
    """A flag to control error handling. If True, exceptions will be raised."""

    ml_grid_object: pipe
    """The main data pipeline object, containing data and model configurations."""

    sub_sample_param_space_pct: float
    """The percentage of the parameter space to sample in a randomized search."""

    parameter_space_size: str
    """The size of the parameter space for base learners (e.g., 'medium', 'xsmall')."""

    model_class_list: List[Any]
    """A list of instantiated model class objects to be evaluated in this run."""

    pg_list: List[int]
    """A list containing the calculated size of the parameter grid for each model."""

    mean_parameter_space_val: float
    """The mean size of the parameter spaces across all models in the run."""

    sub_sample_parameter_val: int
    """The calculated number of iterations for randomized search, based on
    `sub_sample_param_space_pct`."""

    arg_list: List[Tuple]
    """A list of argument tuples, one for each model, to be passed to the grid
    search function."""

    multiprocess: bool
    """A flag to enable or disable multiprocessing for running grid searches
    in parallel."""

    local_param_dict: Dict[str, Any]
    """A dictionary of parameters for the current experimental run."""

    model_error_list: List[List[Any]]
    """A list to store details of any errors encountered during model training."""

    highest_score: float
"""The highest score achieved across all successful model runs in the execute step.""" def __init__(self, ml_grid_object: pipe, local_param_dict: Dict[str, Any]): """Initializes the run class. This class takes the main data pipeline object and a dictionary of local parameters to set up and prepare for executing a series of hyperparameter searches across multiple machine learning models. Args: ml_grid_object (pipe): The main data pipeline object, which contains the data (X_train, y_train, etc.) and a list of model classes to be evaluated. local_param_dict (Dict[str, Any]): A dictionary of parameters for the current experimental run, such as `param_space_size`. """ self.global_params = global_parameters self.verbose = self.global_params.verbose self.error_raise = self.global_params.error_raise self.ml_grid_object = ml_grid_object self.sub_sample_param_space_pct = self.global_params.sub_sample_param_space_pct self.parameter_space_size = local_param_dict.get("param_space_size") self.model_class_list = ml_grid_object.model_class_list if self.verbose >= 2: print(f"{len(self.model_class_list)} models loaded") self.pg_list = [] for elem in self.model_class_list: if not self.global_params.bayessearch: pg = ParameterGrid(elem.parameter_space) pg = len(pg) else: # Handle list of parameter spaces , example log reg pg = calculate_combinations(elem.parameter_space, steps=10) #pg = ParameterGrid(elem.parameter_space) self.pg_list.append(pg) if self.verbose >= 1: print(f"{elem.method_name}:{pg}") for param in elem.parameter_space: if self.global_params.bayessearch is False: try: if type(param) is not list: if ( isinstance(elem.parameter_space.get(param), list) is False and isinstance(elem.parameter_space.get(param), np.ndarray) is False ): print("What is this?") print( f"{elem.method_name, param} {type(elem.parameter_space.get(param))}" ) except Exception as e: # print(e) pass #validate bayes params? # sample from mean of all param space n self.mean_parameter_space_val = np.mean(self.pg_list) self.sub_sample_parameter_val = int( self.sub_sample_param_space_pct * self.mean_parameter_space_val ) # n_iter_v = int(sub_sample_param_space_pct * len(ParameterGrid(parameter_space))) self.arg_list = [] for model_class in self.model_class_list: class_name = model_class self.arg_list.append( ( class_name.algorithm_implementation, class_name.parameter_space, class_name.method_name, self.ml_grid_object, self.sub_sample_parameter_val, ) ) self.multiprocess = False self.local_param_dict = local_param_dict if self.verbose >= 2: print(f"Passed main init, len(arg_list): {len(self.arg_list)}")
    def execute(self) -> Tuple[List[List[Any]], float]:
        """Executes the grid search for each model in the list.

        This method iterates through the list of configured models and their
        parameter spaces, running a cross-validated grid search for each one.
        It captures any errors that occur during the process and returns a
        list of those errors along with the highest score achieved.

        Returns:
            Tuple[List[List[Any]], float]: A tuple containing:
                - A list of model errors, where each error is a list
                  containing the algorithm instance, the exception, and the
                  formatted traceback.
                - The highest score achieved across all successful model runs.
        """
        self.model_error_list = []
        self.highest_score = 0

        if self.multiprocess:
            # Multiprocessing is not yet implemented; the wrapper below is a
            # placeholder for a future Pool-based implementation.
            def multi_run_wrapper(args: Tuple) -> Any:
                print("not implemented")
                # return grid_search_cross_validate.grid_search_crossvalidate(*args)

            if __name__ == "__main__":
                from multiprocessing import Pool

                pool = Pool(8)
                results = pool.map(multi_run_wrapper, self.arg_list)
                pool.close()
        else:
            for k in range(len(self.arg_list)):
                try:
                    print("grid searching...")
                    res = grid_search_cross_validate.grid_search_crossvalidate(
                        *self.arg_list[k]
                    ).grid_search_cross_validate_score_result
                    self.highest_score = max(self.highest_score, res)
                    print(f"highest score: {self.highest_score}")
                except CatBoostError as e:
                    print(f"CatBoostError: {e}")
                    print("continuing despite CatBoost error...")
                except Exception as e:
                    print(e)
                    print("error on", self.arg_list[k][2])
                    # Record the failing algorithm, the exception, and the
                    # formatted traceback for later inspection.
                    self.model_error_list.append(
                        [self.arg_list[k][0], e, traceback.format_exc()]
                    )
                    if self.error_raise:
                        reply = input(
                            "error thrown in grid_search_crossvalidate on "
                            "model class list, input pass to pass else raise"
                        )
                        if reply != "pass":
                            raise e

        print(
            f"Model error list: nb. errors returned from func: "
            f"{self.model_error_list}"
        )

        # Return the errors and the highest score for further optimisation.
        return self.model_error_list, self.highest_score
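
The randomized-search budget computed in `__init__` is easiest to see with a small worked example. The sketch below is illustrative only: the two parameter spaces and the `sub_sample_param_space_pct` value are hypothetical and not taken from ml_grid, but `ParameterGrid` is the same scikit-learn utility the module uses to size each grid.

import numpy as np
from sklearn.model_selection import ParameterGrid

# Hypothetical parameter spaces for two models.
knn_space = {"n_neighbors": [3, 5, 7, 9], "weights": ["uniform", "distance"]}
svc_space = {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}

# Size of each grid, as in self.pg_list: 4*2 = 8 and 3*2 = 6 combinations.
pg_list = [len(ParameterGrid(knn_space)), len(ParameterGrid(svc_space))]

mean_size = np.mean(pg_list)  # 7.0, as in self.mean_parameter_space_val

sub_sample_param_space_pct = 0.5  # assumed value of the global setting
n_iter = int(sub_sample_param_space_pct * mean_size)  # 3 iterations per search
print(pg_list, mean_size, n_iter)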
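
A minimal usage sketch, assuming a `pipe` object has already been configured; the `pipe` constructor arguments are elided here because they depend on your data and ml_grid setup:

from ml_grid.pipeline.data import pipe
from ml_grid.pipeline.main import run

local_param_dict = {"param_space_size": "medium"}
ml_grid_object = pipe(...)  # placeholder: supply your own pipeline arguments

runner = run(ml_grid_object, local_param_dict)
model_error_list, highest_score = runner.execute()

print(f"best score: {highest_score}, errors: {len(model_error_list)}")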