Source code for ml_grid.model_classes.xgb_classifier_class

"""XGBoost Classifier.

This module contains the XGB_class_class, which is a configuration
class for the XGBClassifier. It provides parameter spaces for
grid search and Bayesian optimization.
"""

import logging
from typing import Any, Dict, Optional, Union

import pandas as pd
import xgboost as xgb
from skopt.space import Categorical, Integer, Real

from ml_grid.util import param_space
from ml_grid.util.global_params import global_parameters

logging.getLogger("ml_grid").debug("Imported XGB class")


class XGBClassifierClass:
    """XGBoost classifier with support for Bayesian and grid-search parameter spaces."""

    def __init__(
        self,
        X: Optional[pd.DataFrame] = None,
        y: Optional[pd.Series] = None,
        parameter_space_size: Optional[str] = None,
    ):
        """Initializes the XGBClassifierClass.

        This class wraps the XGBoost classifier algorithm, allowing for easy
        configuration and use within a grid search or Bayesian optimization
        framework by setting up a customizable parameter space.

        Args:
            X (Optional[pd.DataFrame]): Feature matrix for training.
                Defaults to None.
            y (Optional[pd.Series]): Target vector for training.
                Defaults to None.
            parameter_space_size (Optional[str]): Size of the parameter space
                for optimization. Defaults to None.

        Raises:
            ValueError: If `parameter_space_size` is not a valid key. (Note:
                the current implementation does not explicitly raise this.)
        """
        self.X: Optional[pd.DataFrame] = X
        self.y: Optional[pd.Series] = y
        # Initialize the algorithm implementation using XGBClassifier
        self.algorithm_implementation: xgb.XGBClassifier = xgb.XGBClassifier()
        self.method_name: str = "XGBClassifier"
        # Initialize the parameter space handler
        self.parameter_vector_space: param_space.ParamSpace = param_space.ParamSpace(
            parameter_space_size
        )
        self.parameter_space: Dict[str, Any]
        # Patch max_bin dynamically for compatibility.
        # NOTE: this helper is defined for dynamic patching of 'max_bin' but
        # is not invoked elsewhere in this constructor.
        def patch_max_bin(param_value: Any) -> Union[int, Real, Integer, Any]:
            """Ensures the 'max_bin' parameter is >= 2.

            XGBoost's 'max_bin' parameter must be at least 2. This function
            patches the provided value to meet this requirement, handling
            integers and skopt space objects.

            Args:
                param_value (Any): The original parameter value, which can be
                    an integer or a skopt space object (Real, Integer).

            Returns:
                Union[int, Real, Integer, Any]: The patched parameter value,
                    ensuring it is >= 2.
            """
            if isinstance(param_value, int):
                return max(2, param_value)
            elif hasattr(param_value, "rvs"):
                # skopt space objects (anything sampleable via .rvs()):
                # rebuild the dimension with a lower bound of 2.
                return (
                    Real(2, param_value.high, prior=param_value.prior)
                    if isinstance(param_value, Real)
                    else Integer(2, param_value.high)
                )
            else:
                return param_value

        # Set up the parameter space based on the selected optimization method
        if global_parameters.bayessearch:
            # Bayesian optimization: define the space with skopt dimensions
            # (Real, Integer, and Categorical).
            self.parameter_space = {
                "objective": Categorical(
                    ["binary:logistic"]
                ),  # Objective function for binary classification
                "booster": Categorical(
                    ["gbtree", "gblinear", "dart"]
                ),  # Type of boosting model
                "gamma": Real(
                    1e-5, 1e-2, prior="log-uniform"
                ),  # Regularization parameter
                "grow_policy": Categorical(
                    ["depthwise", "lossguide"]
                ),  # Tree growth policy
                "learning_rate": Real(1e-5, 1e-2, prior="log-uniform"),  # Learning rate
                "max_bin": Integer(2, 100),
                "max_depth": Integer(2, 50),
                "max_leaves": Integer(2, 100),
                "min_child_weight": Real(1e-5, 1e-2, prior="log-uniform"),
                "n_estimators": Integer(50, 500),
                "n_jobs": Categorical(
                    [-1]
                ),  # Number of parallel threads to use for training
                "random_state": Categorical([None]),  # Random state for reproducibility
                "reg_alpha": Real(1e-5, 1e-2, prior="log-uniform"),
                "reg_lambda": Real(1e-5, 1e-2, prior="log-uniform"),
                "sampling_method": Categorical(
                    ["uniform"]
                ),  # Sampling method during training
                "verbosity": Categorical([0]),  # Verbosity level during training
                "tree_method": Categorical(["auto"]),
            }
            # Future use parameters for Bayesian optimization:
            # "use_label_encoder": Categorical([True, False]),  # Use label encoder
            # "base_score": Real(0.0, 1.0, "uniform"),  # Base score for predictions
            # "callbacks": [None],  # Custom callbacks for training
            # "colsample_bylevel": Real(0.5, 1, "uniform"),  # Column sampling by level
            # "colsample_bynode": Real(0.5, 1, "uniform"),  # Column sampling by node
            # "colsample_bytree": Real(0.5, 1, "uniform"),  # Column sampling by tree
            # "early_stopping_rounds": Categorical([None]),  # Early stopping for boosting rounds
            # "enable_categorical": Categorical([True, False]),  # Enable categorical variables (if needed)
            # "eval_metric": Categorical([None]),  # Evaluation metric (optional)
            # "gpu_id": Categorical([None]),  # GPU id to use for training
            # "importance_type": Categorical(["weight", "gain", "cover"]),  # Type of feature importance calculation
            # "interaction_constraints": Categorical([None]),  # Constraints for feature interaction
            # "max_cat_to_onehot": Real(1, 100, "uniform"),  # Max categories for one-hot encoding
            # "max_delta_step": Real(0, 10, "uniform"),  # Max delta step for optimization
            # "monotone_constraints": Categorical([None]),  # Constraints for monotonicity in predictions
            # "num_parallel_tree": Real(1, 10, "uniform"),  # Number of parallel trees in boosting
            # "predictor": Categorical(["cpu_predictor", "gpu_predictor"]),  # Type of predictor (e.g., 'gpu_predictor')
            # "scale_pos_weight": Real(1, 10, "uniform"),  # Scale weight for positive class
            # "subsample": Real(0.5, 1, "uniform"),  # Subsampling ratio for training
            # "tree_method": Categorical(["auto", "gpu_hist", "hist"]),  # Tree method for GPU (optional)
            # "validate_parameters": Categorical([None]),  # Validate parameters during training
        else:
            # Traditional grid search: define the parameter space using lists
            self.parameter_space = {
                "objective": [
                    "binary:logistic"
                ],  # Objective function for binary classification
                "booster": ["gbtree", "gblinear", "dart"],  # Type of boosting model
                "gamma": self.parameter_vector_space.param_dict.get(
                    "log_small"
                ),  # Regularization parameter
                "grow_policy": ["depthwise", "lossguide"],  # Tree growth policy
                "learning_rate": self.parameter_vector_space.param_dict.get(
                    "log_small"
                ),  # Learning rate
                "max_bin": self.parameter_vector_space.param_dict.get(
                    "log_large_long"
                ),  # Max bins for discretization
                "max_depth": self.parameter_vector_space.param_dict.get(
                    "log_large_long"
                ),  # Max depth of tree
                "max_leaves": self.parameter_vector_space.param_dict.get(
                    "log_large_long"
                ),  # Max number of leaves
                "min_child_weight": [None],  # Minimum sum of instance weight in a child
                "n_estimators": self.parameter_vector_space.param_dict.get(
                    "log_large_long"
                ),  # Number of boosting rounds
                "n_jobs": [-1],  # Number of parallel threads for training
                "random_state": [None],  # Random state for reproducibility
                "reg_alpha": self.parameter_vector_space.param_dict.get(
                    "log_small"
                ),  # L1 regularization term
                "reg_lambda": self.parameter_vector_space.param_dict.get(
                    "log_small"
                ),  # L2 regularization term
                "sampling_method": ["uniform"],  # Sampling method during training
                "verbosity": [0],  # Verbosity level during training
                # Future use parameters for grid search (currently commented out):
                # "use_label_encoder": [True, False],  # Use label encoder
                # "base_score": [0.0, 1.0],  # Base score for predictions
                # "callbacks": [None],  # Custom callbacks for training
                # "colsample_bylevel": [0.5, 1],  # Column sampling by level
                # "colsample_bynode": [0.5, 1],  # Column sampling by node
                # "colsample_bytree": [0.5, 1],  # Column sampling by tree
                # "early_stopping_rounds": [None],  # Early stopping for boosting rounds
                # "enable_categorical": [True, False],  # Enable categorical variables (if needed)
                # "eval_metric": [None],  # Evaluation metric (optional)
                # "gpu_id": [None],  # GPU id to use for training
                # "importance_type": ["weight", "gain", "cover"],  # Type of feature importance calculation
                # "interaction_constraints": [None],  # Constraints for feature interaction
                # "max_cat_to_onehot": [1, 100],  # Max categories for one-hot encoding
                # "max_delta_step": [0, 10],  # Max delta step for optimization
                # "monotone_constraints": [None],  # Constraints for monotonicity in predictions
                # "num_parallel_tree": [1, 10],  # Number of parallel trees in boosting
                # "predictor": ["cpu_predictor", "gpu_predictor"],  # Type of predictor (e.g., 'gpu_predictor')
                # "scale_pos_weight": [1, 10],  # Scale weight for positive class
                # "subsample": [0.5, 1],  # Subsampling ratio for training
                # "tree_method": ["auto", "gpu_hist", "hist"],  # Tree method for GPU (optional)
                # "validate_parameters": [None],  # Validate parameters during training
            }