Source code for ml_grid.model_classes.gradientboosting_classifier_class

"""Gradient Boosting Classifier.

This module contains the GradientBoostingClassifier_class, which is a configuration
class for the GradientBoostingClassifier. It provides parameter spaces for
grid search and Bayesian optimization.
"""

from typing import Any, Dict, Optional

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from skopt.space import Categorical, Integer, Real

from ml_grid.util import param_space
from ml_grid.util.global_params import global_parameters


class GradientBoostingClassifierClass:
    """GradientBoostingClassifier with support for both Bayesian and non-Bayesian parameter spaces."""

    def __init__(
        self,
        X: Optional[pd.DataFrame] = None,
        y: Optional[pd.Series] = None,
        parameter_space_size: Optional[str] = None,
    ):
        """Initializes the GradientBoostingClassifierClass.

        Args:
            X (Optional[pd.DataFrame]): Feature matrix for training. Defaults to None.
            y (Optional[pd.Series]): Target vector for training. Defaults to None.
            parameter_space_size (Optional[str]): Size of the parameter space for
                optimization. Defaults to None.

        Raises:
            ValueError: If `parameter_space_size` is not a valid key (not raised
                explicitly by the current implementation).
        """
        global_params = global_parameters

        self.X: Optional[pd.DataFrame] = X
        self.y: Optional[pd.Series] = y

        # Use the standard GradientBoostingClassifier directly
        self.algorithm_implementation: GradientBoostingClassifier = (
            GradientBoostingClassifier()
        )
        self.method_name: str = "GradientBoostingClassifier"

        # Define the parameter vector space
        self.parameter_vector_space: param_space.ParamSpace = param_space.ParamSpace(
            parameter_space_size
        )

        self.parameter_space: Dict[str, Any]
        if global_params.bayessearch:
            # Define the parameter space for Bayesian optimization
            self.parameter_space = {
                "ccp_alpha": Real(0.0, 1.0, prior="uniform"),
                "criterion": Categorical(["friedman_mse"]),
                "init": Categorical([None]),
                "learning_rate": Real(1e-5, 1e-1, prior="log-uniform"),
                "loss": Categorical(["log_loss", "exponential"]),
                "max_depth": Integer(2, 10),
                "max_features": Categorical(["sqrt", "log2"]),
                "max_leaf_nodes": Integer(10, 100),
                "min_impurity_decrease": Real(0.0, 0.5, prior="uniform"),
                "min_samples_leaf": Integer(1, 20),
                "min_samples_split": Integer(2, 20),
                "min_weight_fraction_leaf": Real(0.0, 0.5, prior="uniform"),
                "n_estimators": Integer(50, 500),
                "n_iter_no_change": Integer(5, 20),
                "subsample": Real(0.1, 1.0, prior="uniform"),
                "tol": Real(1e-5, 1e-1, prior="log-uniform"),
                "validation_fraction": Real(0.1, 0.3, prior="uniform"),
                "verbose": Categorical([0]),
                "warm_start": Categorical([False]),
            }
        else:
            # Define the parameter space for traditional grid search
            self.parameter_space = {
                "ccp_alpha": list(
                    self.parameter_vector_space.param_dict.get("log_small")
                ),
                "criterion": ["friedman_mse"],
                "init": [None],
                "learning_rate": list(
                    self.parameter_vector_space.param_dict.get("log_small")
                ),
                "loss": ["log_loss", "exponential"],
                # "max_depth": list(range(2, 11)),  # Uncomment if needed
                "max_features": ["sqrt", "log2"],
                # "max_leaf_nodes": list(range(10, 1001)),  # Uncomment if needed
                # "min_impurity_decrease": list(self.parameter_vector_space.param_dict.get("log_small")),  # Uncomment if needed
                # "min_samples_leaf": list(range(1, 11)),  # Uncomment if needed
                # "min_samples_split": list(range(2, 21)),  # Uncomment if needed
                # "min_weight_fraction_leaf": np.linspace(0.0, 0.5, 6).tolist(),  # Uncomment if needed
                "n_estimators": list(
                    self.parameter_vector_space.param_dict.get("log_large_long")
                ),
                # "n_iter_no_change": list(range(5, 51)),  # Uncomment if needed
                "subsample": list(
                    # Drop the leading 0.0 from lin_zero_one; subsample must be > 0
                    np.delete(
                        self.parameter_vector_space.param_dict.get("lin_zero_one"), 0
                    )
                ),
                "tol": list(self.parameter_vector_space.param_dict.get("log_small")),
                "validation_fraction": [0.1],
                "verbose": [0],
                "warm_start": [False],
            }
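

if __name__ == "__main__":
    # Illustrative usage sketch, not part of the original module: it shows how
    # the parameter spaces built above might be consumed. The size key
    # "medium" is an assumed example value; valid keys depend on
    # ml_grid.util.param_space.ParamSpace. The synthetic data below is a
    # placeholder for a real feature matrix and target vector.
    from sklearn.datasets import make_classification
    from sklearn.model_selection import GridSearchCV

    X_demo, y_demo = make_classification(
        n_samples=200, n_features=10, random_state=0
    )

    model_class = GradientBoostingClassifierClass(parameter_space_size="medium")

    if global_parameters.bayessearch:
        # skopt's BayesSearchCV consumes the skopt.space dictionaries.
        from skopt import BayesSearchCV

        search = BayesSearchCV(
            model_class.algorithm_implementation,
            model_class.parameter_space,
            n_iter=16,
            cv=3,
            random_state=0,
        )
    else:
        # sklearn's GridSearchCV consumes the plain-list dictionaries.
        search = GridSearchCV(
            model_class.algorithm_implementation,
            model_class.parameter_space,
            cv=3,
        )

    search.fit(X_demo, y_demo)
    print(search.best_params_)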