# Source code for ml_grid.pipeline.hyperparameter_search

import logging
import warnings
from typing import Any, Dict, List, Union
import numpy as np
import joblib

import pandas as pd
import tensorflow as tf
from sklearn.base import BaseEstimator, is_classifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.utils import point_asdict
from ml_grid.model_classes.AutoKerasClassifierWrapper import AutoKerasClassifierWrapper
from ml_grid.model_classes.FLAMLClassifierWrapper import FLAMLClassifierWrapper
from ml_grid.model_classes.H2OAutoMLClassifier import H2OAutoMLClassifier
from ml_grid.model_classes.H2ODeepLearningClassifier import H2ODeepLearningClassifier
from ml_grid.model_classes.H2ODRFClassifier import H2ODRFClassifier
from ml_grid.model_classes.H2OGAMClassifier import H2OGAMClassifier
from ml_grid.model_classes.H2OGBMClassifier import H2OGBMClassifier
from ml_grid.model_classes.H2OGLMClassifier import H2OGLMClassifier
from ml_grid.model_classes.H2ONaiveBayesClassifier import H2ONaiveBayesClassifier
from ml_grid.model_classes.H2ORuleFitClassifier import H2ORuleFitClassifier
from ml_grid.model_classes.H2OStackedEnsembleClassifier import (
    H2OStackedEnsembleClassifier,
)
from ml_grid.model_classes.H2OXGBoostClassifier import H2OXGBoostClassifier
from ml_grid.model_classes.keras_classifier_class import KerasClassifierClass

# from ml_grid.model_classes.knn_wrapper_class import KNNWrapper
from ml_grid.model_classes.NeuralNetworkKerasClassifier import NeuralNetworkClassifier
from ml_grid.util.global_params import global_parameters
from ml_grid.util.validate_parameters import validate_parameters_helper


class PatchedBayesSearchCV(BayesSearchCV):
    """BayesSearchCV subclass that patches ``_step`` for non-scalar search spaces.

    The stock skopt ``_step`` coerces every sampled parameter value to a Python
    scalar via ``np.array(v).item()``, which raises
    ``ValueError: can only convert an array of size 1 to a Python scalar`` when
    a Categorical dimension holds non-scalar values such as tuples. This patch
    reproduces the original ``_step`` with that coercion removed, plus two
    extra workarounds (H2O type coercion and one-by-one ``optimizer.tell``).
    """

    def _step(
        self,
        search_space,
        optimizer,
        score_name=None,
        evaluate_candidates=None,
        n_points=1,
    ):
        """Run one ask/evaluate/tell iteration of the Bayesian search.

        A patched version of ``_step`` to handle non-scalar Categorical
        parameters. This is a copy of the original ``_step`` method from an
        older skopt version, with the problematic line that causes
        ``ValueError: can only convert an array of size 1 to a Python scalar``
        removed.

        Args:
            search_space: The skopt search space for this step.
            optimizer: The skopt optimizer instance to ask/tell.
            score_name (str, optional): Metric name from the caller; ignored
                here because it can be polluted in older skopt versions
                (see return-value note below).
            evaluate_candidates (callable): Callback that fits and scores a
                list of candidate parameter dicts, returning a results dict.
            n_points (int): Number of candidate points to ask for this step.

        Returns:
            tuple: ``(results, score_name)`` where ``results`` packs the
            sampled params and scores, and ``score_name`` is the base refit
            metric name (or ``"score"`` when no refit metric is set).
        """
        # get parameter values to evaluate
        params = optimizer.ask(n_points=n_points)

        # The problematic line `params = [[np.array(v).item() for v in p] for p in params]`
        # is removed here to support non-scalar parameter values like tuples.

        # make lists into dictionaries
        params_dict = [point_asdict(search_space, p) for p in params]

        # Convert numpy types to native Python types to avoid H2OTypeError
        for i in range(len(params_dict)):
            for k, v in params_dict[i].items():
                if hasattr(v, "item"):
                    params_dict[i][k] = v.item()

        # evaluate all candidates
        all_results = evaluate_candidates(params_dict)

        # Feed the point and score to the optimizer
        # We should feed the score of the refit metric to the optimizer.
        # The `multimetric_` attribute may not be present in all versions.
        # A reliable way to check for multimetric scoring is to see if `scoring`
        # was provided as a dictionary.
        if isinstance(self.scoring, dict):
            # Always use self.refit to get the base metric name (e.g., 'auc').
            # The `score_name` argument can be polluted in older skopt versions
            # on subsequent iterations of the search loop.
            metric_name = self.refit
            mean_test_score = all_results[f"mean_test_{metric_name}"]
        else:
            mean_test_score = all_results["mean_test_score"]

        # Coerce scores to a 1D numpy array of floats to prevent type/shape errors.
        # This handles scalars, lists, and nested lists.
        scores_arr = np.asarray(mean_test_score, dtype=float).flatten()

        # skopt optimizer minimizes the function so we negate the score
        y_tell = (-scores_arr).tolist()

        # WORKAROUND: The batch `tell` method in older skopt versions can be buggy
        # and corrupt the optimizer's internal state (Xi, yi), leading to an
        # IndexError. To avoid this, we feed the points to the optimizer one
        # by one. The `fit` parameter is set to False for all but the last
        # point to ensure the model is fitted only after all points in the
        # batch are told.
        if params:
            # Tell all but the last point without fitting the model
            for i in range(len(params) - 1):
                optimizer.tell(params[i], y_tell[i], fit=False)
            # Tell the last point and trigger the model fit
            optimizer.tell(params[-1], y_tell[-1], fit=True)

        # Pack results into a dictionary
        results = {
            "params": params,
            "mean_test_score": mean_test_score,
            "all_results": all_results,
        }

        # The calling `_run_search` loop expects a score_name back. We return
        # the base metric name to avoid polluting the `score_name` variable
        # in the parent loop.
        return results, self.refit if self.refit else "score"
class HyperparameterSearch:
    """Orchestrates hyperparameter search using GridSearchCV, RandomizedSearchCV, or BayesSearchCV."""

    # The scikit-learn compatible estimator instance.
    algorithm: BaseEstimator
    # The hyperparameter search space.
    parameter_space: Union[Dict, List[Dict]]
    # The name of the algorithm.
    method_name: str
    # A reference to the global parameters singleton instance.
    global_params: global_parameters
    # Percentage of the parameter space to sample for randomized search.
    # Defaults to 100.
    sub_sample_pct: int
    # The maximum number of iterations for randomized or Bayesian search.
    # Defaults to 100.
    max_iter: int
    # The main pipeline object containing data and other parameters.
    ml_grid_object: Any

    def __init__(
        self,
        algorithm: BaseEstimator,
        parameter_space: Union[Dict, List[Dict]],
        method_name: str,
        global_params: Any,
        sub_sample_pct: int = 100,
        max_iter: int = 100,
        ml_grid_object: Any = None,
        cv: Any = None,
    ):
        """Initializes the HyperparameterSearch class.

        Args:
            algorithm (BaseEstimator): The scikit-learn compatible estimator
                instance.
            parameter_space (Union[Dict, List[Dict]]): The hyperparameter
                search space.
            method_name (str): The name of the algorithm.
            global_params (Any): The global parameters object.
            sub_sample_pct (int, optional): Percentage of the parameter space
                to sample for randomized search. Defaults to 100.
            max_iter (int, optional): The maximum number of iterations for
                randomized or Bayesian search. Defaults to 100.
            ml_grid_object (Any, optional): The main pipeline object containing
                data and other parameters. Defaults to None.
            cv (Any, optional): Cross-validation splitting strategy. Can be
                None, int, or a CV splitter. Defaults to None (no
                cross-validation).

        Raises:
            ValueError: If ``ml_grid_object`` is None, or if ``algorithm`` is
                not a recognized classifier and lacks fit/predict methods.
        """
        self.algorithm = algorithm
        self.parameter_space = parameter_space
        self.method_name = method_name
        self.global_params = global_params
        self.sub_sample_pct = sub_sample_pct
        self.max_iter = max_iter
        self.ml_grid_object = ml_grid_object
        self.cv = cv

        if self.ml_grid_object is None:
            raise ValueError("ml_grid_object is required.")

        # Custom wrappers that might not be recognized by is_classifier
        custom_classifier_types = (
            # KNNWrapper,
            H2OAutoMLClassifier,
            H2OGBMClassifier,
            H2ODRFClassifier,
            H2OGAMClassifier,
            H2ODeepLearningClassifier,
            H2OGLMClassifier,
            H2ONaiveBayesClassifier,
            H2ORuleFitClassifier,
            H2OXGBoostClassifier,
            H2OStackedEnsembleClassifier,
            NeuralNetworkClassifier,  # type: ignore
            KerasClassifierClass,
        )

        # Check if it's a valid classifier: sklearn-recognized, a known custom
        # wrapper, or anything duck-typed with fit/predict.
        is_valid = (
            is_classifier(self.algorithm)
            or isinstance(self.algorithm, custom_classifier_types)
            or (hasattr(self.algorithm, "fit") and hasattr(self.algorithm, "predict"))
        )
        if not is_valid:
            raise ValueError(
                f"The provided algorithm is not a valid classifier. "
                f"Received type: {type(self.algorithm)}"
            )

        # Configure warnings
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        warnings.filterwarnings("ignore", category=UserWarning)
        warnings.filterwarnings(
            "ignore", category=RuntimeWarning
        )  # Suppress divide by zero warnings from NaiveBayes

        # Configure GPUs if applicable
        if (
            "keras" in method_name.lower()
            or "xgb" in method_name.lower()
            or "catboost" in method_name.lower()
        ):
            self._configure_gpu()

    def _configure_gpu(self) -> None:
        """Configures TensorFlow to use GPU with memory growth enabled.

        Failures are logged as warnings rather than raised, so a missing or
        misconfigured GPU never aborts the search.
        """
        # FIX: acquire the logger *before* the try block. In the original,
        # `logger` was bound inside the try, so a failure on that first
        # statement would have raised NameError inside the except handler.
        logger = logging.getLogger("ml_grid")
        try:
            gpu_devices = tf.config.experimental.list_physical_devices("GPU")
            for device in gpu_devices:
                # Memory growth must be set before GPUs are initialized.
                tf.config.experimental.set_memory_growth(device, True)
        except Exception as e:
            logger.warning(f"Could not configure GPU for TensorFlow: {e}")