Source code for ml_grid.model_classes.catboost_classifier_class

"""CatBoost Classifier.

This module contains the CatBoostClassifierClass, which is a configuration
class for the CatBoostClassifier. It provides parameter spaces for
grid search and Bayesian optimization.
"""

import logging
from typing import Any, Dict, List, Optional, Union

import pandas as pd
from catboost import CatBoostClassifier
from skopt.space import Categorical, Integer, Real

from ml_grid.util.global_params import global_parameters


class CatBoostClassifierClass:
    """Configuration wrapper around ``CatBoostClassifier``.

    Bundles an unfitted ``CatBoostClassifier`` with a hyperparameter search
    space. The shape of the search space depends on
    ``global_parameters.bayessearch``:

    * truthy  -> a single dict of ``skopt`` dimensions (``Integer``, ``Real``,
      ``Categorical``);
    * falsy   -> a one-element list holding a dict of explicit candidate
      value lists.

    Attributes:
        X: The feature table passed to the constructor (stored as given).
        y: The target passed to the constructor (stored as given).
        algorithm_implementation: A ``CatBoostClassifier`` built with its
            default arguments.
        method_name: The fixed string ``"CatBoostClassifier"``.
        parameter_space: The search space described above.
    """

    def __init__(
        self,
        X: Optional[pd.DataFrame] = None,
        y: Optional[pd.Series] = None,
        parameter_space_size: Optional[str] = None,
    ) -> None:
        """Initializes the CatBoostClassifierClass.

        Args:
            X (Optional[pd.DataFrame]): The input features. Defaults to None.
            y (Optional[pd.Series]): The target variable. Defaults to None.
            parameter_space_size (Optional[str]): The size of the parameter
                space. Defaults to None. This constructor does not read the
                value, so it is neither validated nor used to scale the
                search space, and no exception is raised for any value.
        """
        self.X: Optional[pd.DataFrame] = X
        self.y: Optional[pd.Series] = y

        # Use CatBoostClassifier directly, with its default constructor arguments.
        self.algorithm_implementation: CatBoostClassifier = CatBoostClassifier()
        self.method_name: str = "CatBoostClassifier"

        # Declared here, assigned in exactly one of the two branches below.
        self.parameter_space: Union[List[Dict[str, Any]], Dict[str, Any]]

        # Both branches log to the same named logger; look it up once.
        logger = logging.getLogger("ml_grid")

        # NOTE(review): CatBoost's parameter reference describes
        # `min_data_in_leaf` as usable only with the Depthwise and Lossguide
        # grow policies; both spaces below also offer "SymmetricTree".
        # Confirm the surrounding search tolerates such combinations.
        if global_parameters.bayessearch:
            # Bayesian search: one dict of skopt dimensions.
            self.parameter_space = {
                "iterations": Integer(100, 1000),
                "learning_rate": Real(0.01, 0.3, prior="uniform"),
                "depth": Integer(4, 10),
                "l2_leaf_reg": Real(1e-5, 1, prior="log-uniform"),
                "random_strength": Real(1e-5, 1, prior="log-uniform"),
                "rsm": Real(0.8, 1.0, prior="uniform"),
                "loss_function": Categorical(["Logloss", "CrossEntropy"]),
                "eval_metric": Categorical(["Accuracy", "AUC"]),
                "bootstrap_type": Categorical(["Bernoulli", "MVS"]),
                "subsample": Real(0.8, 1.0, prior="uniform"),
                "max_bin": Integer(32, 128),
                "grow_policy": Categorical(
                    ["SymmetricTree", "Depthwise", "Lossguide"]
                ),
                "min_data_in_leaf": Integer(1, 7),
                "one_hot_max_size": Integer(2, 10),
                "leaf_estimation_method": Categorical(["Newton", "Gradient"]),
                "fold_permutation_block": Integer(1, 5),
                "od_pval": Real(1e-9, 0.1, prior="log-uniform"),
                "od_wait": Integer(10, 30),
                # Single-choice dimensions: the value is fixed, not searched.
                "verbose": Categorical([0]),
                "allow_const_label": Categorical([True]),
            }
            logger.debug(
                f"Bayesian Parameter Space for CatBoost: {self.parameter_space}"
            )
        else:
            # Grid search parameter space must be a list of dicts.
            self.parameter_space = [
                {
                    "iterations": [100, 200, 500, 1000],
                    "learning_rate": [0.01, 0.05, 0.1, 0.3],
                    "depth": [4, 6, 8, 10],
                    "l2_leaf_reg": [1e-5, 1e-3, 0.1, 1],
                    "random_strength": [1e-5, 1e-3, 0.1, 1],
                    "rsm": [0.8, 1],
                    "loss_function": ["Logloss", "CrossEntropy"],
                    "eval_metric": ["Accuracy", "AUC"],
                    "bootstrap_type": ["Bernoulli", "MVS"],
                    "subsample": [0.8, 1],
                    "max_bin": [32, 64, 128],
                    "grow_policy": ["SymmetricTree", "Depthwise", "Lossguide"],
                    "min_data_in_leaf": [1, 3, 5, 7],
                    "one_hot_max_size": [2, 5, 10],
                    "leaf_estimation_method": ["Newton", "Gradient"],
                    "fold_permutation_block": [1, 3, 5],
                    "od_pval": [1e-9, 1e-7, 1e-5, 1e-3],
                    "od_wait": [10, 20, 30],
                    # Single-candidate entries: the value is fixed, not searched.
                    "verbose": [0],
                    "allow_const_label": [True],
                }
            ]
            logger.debug(
                f"Traditional Parameter Space for CatBoost: {self.parameter_space}"
            )