from typing import Any, Optional, Union
import numpy as np
import pandas as pd
import xgboost as xgb
from ml_grid.util import param_space
from skopt.space import Real, Categorical, Integer
from ml_grid.util.global_params import global_parameters
print("Imported XGB class")
class XGB_class_class:
"""XGBoost classifier with support for Bayesian and Grid Search parameter spaces."""
    def __init__(
        self,
        X: Optional[pd.DataFrame] = None,
        y: Optional[pd.Series] = None,
        parameter_space_size: Optional[str] = None,
    ):
"""Initializes the XGB_class_class.
The XGB_class_class wraps the XGBoost classifier algorithm, allowing for
easy configuration and use within a grid search or Bayesian optimization
framework by setting up a customizable parameter space.
Args:
X (Optional[pd.DataFrame]): Feature matrix for training. Defaults to None.
y (Optional[pd.Series]): Target vector for training. Defaults to None.
parameter_space_size (Optional[str]): Size of the parameter space for
optimization. Defaults to None.
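
        Example:
            Illustrative only ("medium" is an assumed size label; the accepted
            values are defined by ml_grid.util.param_space.ParamSpace)::

                >>> clf = XGB_class_class(parameter_space_size="medium")  # doctest: +SKIP
                >>> clf.method_name  # doctest: +SKIP
                'XGBClassifier'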
"""
        # Initialize the algorithm implementation using XGBClassifier.
        self.algorithm_implementation = xgb.XGBClassifier()
        self.method_name = "XGBClassifier"

        # Initialize the parameter space handler.
        self.parameter_vector_space = param_space.ParamSpace(parameter_space_size)
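
        # NOTE (assumption, inferred from usage below): ParamSpace is expected
        # to expose a `param_dict` mapping range labels such as "log_small"
        # and "log_large_long" to value ranges -- skopt dimensions when
        # global_parameters.bayessearch is enabled, plain lists otherwise.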

        # Patch max_bin dynamically for compatibility.
        def patch_max_bin(param_value: Any) -> Union[int, Real, Integer, Any]:
            """Ensures the 'max_bin' parameter is >= 2.

            XGBoost's 'max_bin' parameter must be at least 2. This function
            patches the provided value to meet that requirement, handling
            plain integers as well as skopt space objects.

            Args:
                param_value (Any): The original parameter value, which can be
                    an integer or a skopt space object (Real, Integer).

            Returns:
                Union[int, Real, Integer, Any]: The patched parameter value,
                guaranteed to be >= 2 where applicable; other types are
                returned unchanged.
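
            Example:
                Illustrative doctest (assumes scikit-optimize is installed)::

                    >>> patch_max_bin(1)
                    2
                    >>> patch_max_bin(Integer(1, 512)).low
                    2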
"""
            if isinstance(param_value, int):
                return max(2, param_value)
            elif hasattr(param_value, "rvs"):  # skopt dimensions expose .rvs()
                if isinstance(param_value, Real):
                    return Real(2, param_value.high, prior=param_value.prior)
                return Integer(2, param_value.high)
            else:
                return param_value

        # Set up the parameter space based on the selected optimization method.
        if global_parameters.bayessearch:
            # Bayesian optimization: define the search space with skopt
            # dimensions (Real, Integer, Categorical).
            self.parameter_space = [
                {
                    "objective": Categorical(["binary:logistic"]),  # Objective function for binary classification
                    "booster": Categorical(["gbtree", "gblinear", "dart"]),  # Type of boosting model
                    "gamma": self.parameter_vector_space.param_dict.get("log_small"),  # Minimum loss reduction to make a split
                    "grow_policy": Categorical(["depthwise", "lossguide"]),  # Tree growth policy
                    "learning_rate": self.parameter_vector_space.param_dict.get("log_small"),  # Learning rate
                    # XGBoost rejects max_bin < 2 ("value 1 for Parameter max_bin
                    # should be greater equal to 2"). max_bin is the maximum number
                    # of bins per feature for histogram-based algorithms, so the
                    # dimension is patched to start at 2.
                    "max_bin": patch_max_bin(
                        self.parameter_vector_space.param_dict.get("log_large_long")
                    ),
                    "max_depth": self.parameter_vector_space.param_dict.get("log_large_long"),  # Max depth of a tree
                    "max_leaves": self.parameter_vector_space.param_dict.get("log_large_long"),  # Max number of leaves
                    "min_child_weight": self.parameter_vector_space.param_dict.get("log_small"),  # Minimum sum of instance weight in a child
                    "n_estimators": self.parameter_vector_space.param_dict.get("log_large_long"),  # Number of boosting rounds
                    "n_jobs": Categorical([-1]),  # Number of parallel threads for training
                    "random_state": Categorical([None]),  # Random state for reproducibility
                    "reg_alpha": self.parameter_vector_space.param_dict.get("log_small"),  # L1 regularization term
                    "reg_lambda": self.parameter_vector_space.param_dict.get("log_small"),  # L2 regularization term
                    "sampling_method": Categorical(["uniform"]),  # Sampling method during training
                    "verbosity": Categorical([0]),  # Verbosity level during training
                    "tree_method": Categorical(["auto"]),  # Tree construction algorithm
                }
            ]

            # Parameters reserved for future use with Bayesian optimization:
            # "use_label_encoder": Categorical([True, False]),  # Use label encoder
            # "base_score": Real(0.0, 1.0, "uniform"),  # Base score for predictions
            # "callbacks": [None],  # Custom callbacks for training
            # "colsample_bylevel": Real(0.5, 1, "uniform"),  # Column sampling by level
            # "colsample_bynode": Real(0.5, 1, "uniform"),  # Column sampling by node
            # "colsample_bytree": Real(0.5, 1, "uniform"),  # Column sampling by tree
            # "early_stopping_rounds": Categorical([None]),  # Early stopping for boosting rounds
            # "enable_categorical": Categorical([True, False]),  # Enable categorical variables (if needed)
            # "eval_metric": Categorical([None]),  # Evaluation metric (optional)
            # "gpu_id": Categorical([None]),  # GPU id to use for training
            # "importance_type": Categorical(["weight", "gain", "cover"]),  # Type of feature importance calculation
            # "interaction_constraints": Categorical([None]),  # Constraints for feature interaction
            # "max_cat_to_onehot": Real(1, 100, "uniform"),  # Max categories for one-hot encoding
            # "max_delta_step": Real(0, 10, "uniform"),  # Max delta step for optimization
            # "monotone_constraints": Categorical([None]),  # Constraints for monotonicity in predictions
            # "num_parallel_tree": Real(1, 10, "uniform"),  # Number of parallel trees in boosting
            # "predictor": Categorical(["cpu_predictor", "gpu_predictor"]),  # Type of predictor
            # "scale_pos_weight": Real(1, 10, "uniform"),  # Scale weight for the positive class
            # "subsample": Real(0.5, 1, "uniform"),  # Subsampling ratio for training
            # "tree_method": Categorical(["auto", "gpu_hist", "hist"]),  # Tree method (e.g. for GPU)
            # "validate_parameters": Categorical([None]),  # Validate parameters during training
        else:
            # Traditional grid search: define the parameter space using lists.
            # Note: unlike the Bayesian branch, max_bin is not patched here,
            # so the supplied values are assumed to already be >= 2.
            self.parameter_space = {
                "objective": ["binary:logistic"],  # Objective function for binary classification
                "booster": ["gbtree", "gblinear", "dart"],  # Type of boosting model
                "gamma": self.parameter_vector_space.param_dict.get("log_small"),  # Minimum loss reduction to make a split
                "grow_policy": ["depthwise", "lossguide"],  # Tree growth policy
                "learning_rate": self.parameter_vector_space.param_dict.get("log_small"),  # Learning rate
                "max_bin": self.parameter_vector_space.param_dict.get("log_large_long"),  # Max bins for discretization
                "max_depth": self.parameter_vector_space.param_dict.get("log_large_long"),  # Max depth of a tree
                "max_leaves": self.parameter_vector_space.param_dict.get("log_large_long"),  # Max number of leaves
                "min_child_weight": [None],  # Minimum sum of instance weight in a child
                "n_estimators": self.parameter_vector_space.param_dict.get("log_large_long"),  # Number of boosting rounds
                "n_jobs": [-1],  # Number of parallel threads for training
                "random_state": [None],  # Random state for reproducibility
                "reg_alpha": self.parameter_vector_space.param_dict.get("log_small"),  # L1 regularization term
                "reg_lambda": self.parameter_vector_space.param_dict.get("log_small"),  # L2 regularization term
                "sampling_method": ["uniform"],  # Sampling method during training
                "verbosity": [0],  # Verbosity level during training
                # Parameters reserved for future use with grid search:
                # "use_label_encoder": [True, False],  # Use label encoder
                # "base_score": [0.0, 1.0],  # Base score for predictions
                # "callbacks": [None],  # Custom callbacks for training
                # "colsample_bylevel": [0.5, 1],  # Column sampling by level
                # "colsample_bynode": [0.5, 1],  # Column sampling by node
                # "colsample_bytree": [0.5, 1],  # Column sampling by tree
                # "early_stopping_rounds": [None],  # Early stopping for boosting rounds
                # "enable_categorical": [True, False],  # Enable categorical variables (if needed)
                # "eval_metric": [None],  # Evaluation metric (optional)
                # "gpu_id": [None],  # GPU id to use for training
                # "importance_type": ["weight", "gain", "cover"],  # Type of feature importance calculation
                # "interaction_constraints": [None],  # Constraints for feature interaction
                # "max_cat_to_onehot": [1, 100],  # Max categories for one-hot encoding
                # "max_delta_step": [0, 10],  # Max delta step for optimization
                # "monotone_constraints": [None],  # Constraints for monotonicity in predictions
                # "num_parallel_tree": [1, 10],  # Number of parallel trees in boosting
                # "predictor": ["cpu_predictor", "gpu_predictor"],  # Type of predictor
                # "scale_pos_weight": [1, 10],  # Scale weight for the positive class
                # "subsample": [0.5, 1],  # Subsampling ratio for training
                # "tree_method": ["auto", "gpu_hist", "hist"],  # Tree method (e.g. for GPU)
                # "validate_parameters": [None],  # Validate parameters during training
            }
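

if __name__ == "__main__":
    # Minimal usage sketch (illustrative, not part of the library API).
    # Assumptions: global_parameters.bayessearch is False, so that
    # self.parameter_space is a grid-search dict of lists, and "medium" is a
    # valid size label for ml_grid.util.param_space.ParamSpace.
    from sklearn.datasets import make_classification
    from sklearn.model_selection import GridSearchCV

    # Synthetic binary-classification data purely for demonstration.
    X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)

    wrapper = XGB_class_class(parameter_space_size="medium")  # size label assumed
    search = GridSearchCV(
        wrapper.algorithm_implementation,  # the wrapped xgb.XGBClassifier
        wrapper.parameter_space,           # the grid defined above
        cv=3,
        n_jobs=-1,
    )
    search.fit(X_demo, y_demo)
    print("Best parameters:", search.best_params_)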