Source code for ml_grid.model_classes.xgb_classifier_class

from typing import Any, Optional, Union

import numpy as np
import pandas as pd
import xgboost as xgb
from ml_grid.util import param_space
from skopt.space import Real, Categorical, Integer
from ml_grid.util.global_params import global_parameters

print("Imported XGB class")

[docs] class XGB_class_class: """XGBoost classifier with support for Bayesian and Grid Search parameter spaces.""" def __init__( self, X: Optional[pd.DataFrame] = None, y: Optional[pd.Series] = None, parameter_space_size: Optional[str] = None, ): """Initializes the XGB_class_class. The XGB_class_class wraps the XGBoost classifier algorithm, allowing for easy configuration and use within a grid search or Bayesian optimization framework by setting up a customizable parameter space. Args: X (Optional[pd.DataFrame]): Feature matrix for training. Defaults to None. y (Optional[pd.Series]): Target vector for training. Defaults to None. parameter_space_size (Optional[str]): Size of the parameter space for optimization. Defaults to None. """
[docs] self.X = X
[docs] self.y = y
# Initialize the algorithm implementation using XGBClassifier
[docs] self.algorithm_implementation = xgb.XGBClassifier()
[docs] self.method_name = "XGBClassifier"
# Initialize the parameter space handler
[docs] self.parameter_vector_space = param_space.ParamSpace(parameter_space_size)
# Patch max_bin dynamically for compatibility def patch_max_bin(param_value: Any) -> Union[int, Real, Integer, Any]: """Ensures the 'max_bin' parameter is >= 2. XGBoost's 'max_bin' parameter must be at least 2. This function patches the provided value to meet this requirement, handling integers and skopt space objects. Args: param_value (Any): The original parameter value, which can be an integer or a skopt space object (Real, Integer). Returns: Union[int, Real, Integer, Any]: The patched parameter value, ensuring it is >= 2. """ if isinstance(param_value, int): return max(2, param_value) elif hasattr(param_value, "rvs"): # For sampled values (e.g., skopt spaces) return Real(2, param_value.high, prior=param_value.prior) if isinstance(param_value, Real) else Integer(2, param_value.high) else: return param_value # Set up the parameter space based on the selected optimization method if global_parameters.bayessearch: # Bayesian Optimization: Define parameter space using Real and Categorical self.parameter_space = [{ "objective": Categorical(["binary:logistic"]), # Objective function for binary classification "booster": Categorical(["gbtree", "gblinear", "dart"]), # Type of boosting model "gamma": self.parameter_vector_space.param_dict.get("log_small"), # Regularization parameter "grow_policy": Categorical(["depthwise", "lossguide"]), # Tree growth policy "learning_rate": self.parameter_vector_space.param_dict.get("log_small"), # Learning rate "max_bin": patch_max_bin(self.parameter_vector_space.param_dict.get("log_large_long")), # Max bins for discretization "max_depth": self.parameter_vector_space.param_dict.get("log_large_long"), # Max depth of tree "max_leaves": self.parameter_vector_space.param_dict.get("log_large_long"), # Max number of leaves "min_child_weight": self.parameter_vector_space.param_dict.get("log_small"), # Minimum sum of instance weight in a child "n_estimators": self.parameter_vector_space.param_dict.get("log_large_long"), # Number of boosting rounds "n_jobs": Categorical([-1]), # Number of parallel threads to use for training "random_state": Categorical([None]), # Random state for reproducibility "reg_alpha": self.parameter_vector_space.param_dict.get("log_small"), # L1 regularization term "reg_lambda": self.parameter_vector_space.param_dict.get("log_small"), # L2 regularization term "sampling_method": Categorical(["uniform"]), # Sampling method during training "verbosity": Categorical([0]), # Verbosity level during training "tree_method": Categorical(["auto"]) },{ "objective": Categorical(["binary:logistic"]), # Objective function for binary classification "booster": Categorical(["gbtree", "gblinear", "dart"]), # Type of boosting model "gamma": self.parameter_vector_space.param_dict.get("log_small"), # Regularization parameter "grow_policy": Categorical(["depthwise", "lossguide"]), # Tree growth policy "learning_rate": self.parameter_vector_space.param_dict.get("log_small"), # Learning rate "max_bin": patch_max_bin(self.parameter_vector_space.param_dict.get("log_large_long")), # Max bins for discretization "max_depth": self.parameter_vector_space.param_dict.get("log_large_long"), # Max depth of tree "max_leaves": self.parameter_vector_space.param_dict.get("log_large_long"), # Max number of leaves "min_child_weight": self.parameter_vector_space.param_dict.get("log_small"), # Minimum sum of instance weight in a child "n_estimators": self.parameter_vector_space.param_dict.get("log_large_long"), # Number of boosting rounds "n_jobs": Categorical([-1]), # Number of parallel threads to use for training "random_state": Categorical([None]), # Random state for reproducibility "reg_alpha": self.parameter_vector_space.param_dict.get("log_small"), # L1 regularization term "reg_lambda": self.parameter_vector_space.param_dict.get("log_small"), # L2 regularization term "sampling_method": Categorical(["uniform"]), # Sampling method during training "verbosity": Categorical([0]), # Verbosity level during training "tree_method": Categorical(["auto"]) #value 1 for Parameter max_bin should be greater equal to 2 #max_bin: if using histogram-based algorithm, maximum number of bins per feature }] # Future use parameters for Bayesian optimization # "use_label_encoder": Categorical([True, False]), # Use label encoder # "base_score": Real(0.0, 1.0, "uniform"), # Base score for predictions # "callbacks": [None], # Custom callbacks for training # "colsample_bylevel": Real(0.5, 1, "uniform"), # Column sampling by level # "colsample_bynode": Real(0.5, 1, "uniform"), # Column sampling by node # "colsample_bytree": Real(0.5, 1, "uniform"), # Column sampling by tree # "early_stopping_rounds": Categorical([None]), # Early stopping for boosting rounds # "enable_categorical": Categorical([True, False]), # Enable categorical variables (if needed) # "eval_metric": Categorical([None]), # Evaluation metric (optional) # "gpu_id": Categorical([None]), # GPU id to use for training # "importance_type": Categorical(["weight", "gain", "cover"]), # Type of feature importance calculation # "interaction_constraints": Categorical([None]), # Constraints for feature interaction # "max_cat_to_onehot": Real(1, 100, "uniform"), # Max categories for one-hot encoding # "max_delta_step": Real(0, 10, "uniform"), # Max delta step for optimization # "monotone_constraints": Categorical([None]), # Constraints for monotonicity in predictions # "num_parallel_tree": Real(1, 10, "uniform"), # Number of parallel trees in boosting # "predictor": Categorical(["cpu_predictor", "gpu_predictor"]), # Type of predictor (e.g., 'gpu_predictor') # "scale_pos_weight": Real(1, 10, "uniform"), # Scale weight for positive class # "subsample": Real(0.5, 1, "uniform"), # Subsampling ratio for training # "tree_method": Categorical(["auto", "gpu_hist", "hist"]), # Tree method for GPU (optional) # "validate_parameters": Categorical([None]), # Validate parameters during training else: # Traditional Grid Search: Define parameter space using lists self.parameter_space = { "objective": ["binary:logistic"], # Objective function for binary classification "booster": ["gbtree", "gblinear", "dart"], # Type of boosting model "gamma": self.parameter_vector_space.param_dict.get("log_small"), # Regularization parameter "grow_policy": ["depthwise", "lossguide"], # Tree growth policy "learning_rate": self.parameter_vector_space.param_dict.get("log_small"), # Learning rate "max_bin": self.parameter_vector_space.param_dict.get("log_large_long"), # Max bins for discretization "max_depth": self.parameter_vector_space.param_dict.get("log_large_long"), # Max depth of tree "max_leaves": self.parameter_vector_space.param_dict.get("log_large_long"), # Max number of leaves "min_child_weight": [None], # Minimum sum of instance weight in a child "n_estimators": self.parameter_vector_space.param_dict.get("log_large_long"), # Number of boosting rounds "n_jobs": [-1], # Number of parallel threads for training "random_state": [None], # Random state for reproducibility "reg_alpha": self.parameter_vector_space.param_dict.get("log_small"), # L1 regularization term "reg_lambda": self.parameter_vector_space.param_dict.get("log_small"), # L2 regularization term "sampling_method": ["uniform"], # Sampling method during training "verbosity": [0], # Verbosity level during training # Future use parameters for Grid Search (currently commented out) # "use_label_encoder": [True, False], # Use label encoder # "base_score": [0.0, 1.0], # Base score for predictions # "callbacks": [None], # Custom callbacks for training # "colsample_bylevel": [0.5, 1], # Column sampling by level # "colsample_bynode": [0.5, 1], # Column sampling by node # "colsample_bytree": [0.5, 1], # Column sampling by tree # "early_stopping_rounds": [None], # Early stopping for boosting rounds # "enable_categorical": [True, False], # Enable categorical variables (if needed) # "eval_metric": [None], # Evaluation metric (optional) # "gpu_id": [None], # GPU id to use for training # "importance_type": ["weight", "gain", "cover"], # Type of feature importance calculation # "interaction_constraints": [None], # Constraints for feature interaction # "max_cat_to_onehot": [1, 100], # Max categories for one-hot encoding # "max_delta_step": [0, 10], # Max delta step for optimization # "monotone_constraints": [None], # Constraints for monotonicity in predictions # "num_parallel_tree": [1, 10], # Number of parallel trees in boosting # "predictor": ["cpu_predictor", "gpu_predictor"], # Type of predictor (e.g., 'gpu_predictor') # "scale_pos_weight": [1, 10], # Scale weight for positive class # "subsample": [0.5, 1], # Subsampling ratio for training # "tree_method": ["auto", "gpu_hist", "hist"], # Tree method for GPU (optional) # "validate_parameters": [None], # Validate parameters during training } return None