import pandas as pd
from h2o.estimators import H2OGradientBoostingEstimator
from skopt.space import Integer, Real
from ml_grid.util.global_params import global_parameters
from .H2OBaseClassifier import H2OBaseClassifier
# Define parameter spaces outside the class for better organization and reusability.
PARAM_SPACE_GRID = {
    "xsmall": {
        "ntrees": [50],
        "max_depth": [5],
        "learn_rate": [0.1],
        "sample_rate": [0.8],
        "col_sample_rate": [0.8],
        "seed": [1],
    },
    "small": {
        "ntrees": [50, 100, 200],
        "max_depth": [3, 5, 10],
        "learn_rate": [0.01, 0.1],
        "sample_rate": [0.8, 1.0],
        "col_sample_rate": [0.8, 1.0],
        "seed": [1, 42],
    },
    "medium": {
        "ntrees": [50, 100, 200, 300],
        "max_depth": [3, 5, 10, 15],
        "learn_rate": [0.01, 0.05, 0.1],
        "sample_rate": [0.7, 0.8, 0.9, 1.0],
        "col_sample_rate": [0.7, 0.8, 0.9, 1.0],
        "seed": [1, 42, 123],
    },
}
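
# A sketch of how a grid space above can be reused with scikit-learn's
# GridSearchCV (assumes a running H2O cluster; `X` and `y` are illustrative
# pandas objects, not defined in this module):
#
#     from sklearn.model_selection import GridSearchCV
#     search = GridSearchCV(H2OGBMClassifier(), PARAM_SPACE_GRID["small"], cv=3)
#     search.fit(X, y)
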
PARAM_SPACE_BAYES = {
    "xsmall": {
        "ntrees": Integer(50, 100),
        "max_depth": Integer(3, 5),
        "learn_rate": Real(0.05, 0.15, "log-uniform"),
        "sample_rate": Real(0.7, 0.9),
        "col_sample_rate": Real(0.7, 0.9),
        "seed": Integer(1, 100),
    },
    "small": {
        "ntrees": Integer(50, 500),
        "max_depth": Integer(3, 10),
        "learn_rate": Real(0.01, 0.2, "log-uniform"),
        "sample_rate": Real(0.5, 1.0),
        "col_sample_rate": Real(0.5, 1.0),
        "seed": Integer(1, 1000),
    },
    "medium": {
        "ntrees": Integer(50, 1000),
        "max_depth": Integer(3, 20),
        "learn_rate": Real(0.005, 0.2, "log-uniform"),
        "sample_rate": Real(0.5, 1.0),
        "col_sample_rate": Real(0.5, 1.0),
        "seed": Integer(1, 2000),
    },
}
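
# The Bayesian spaces plug into skopt's BayesSearchCV the same way (a sketch;
# `X` and `y` are illustrative, as above):
#
#     from skopt import BayesSearchCV
#     opt = BayesSearchCV(H2OGBMClassifier(), PARAM_SPACE_BAYES["small"], n_iter=32)
#     opt.fit(X, y)
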
class H2OGBMClassifier(H2OBaseClassifier):
    """A scikit-learn compatible wrapper for H2O's Gradient Boosting Machine.

    This class allows H2O's GBM to be used as a standard scikit-learn
    classifier, making it compatible with tools like GridSearchCV and
    BayesSearchCV.
    """
    def __init__(self, parameter_space_size="small", **kwargs):
        """Initializes the H2OGBMClassifier.

        Args:
            parameter_space_size: One of "xsmall", "small", or "medium",
                selecting a predefined hyperparameter search space.
            **kwargs: Passed directly to the H2OGradientBoostingEstimator.
                Example args: ntrees=50, max_depth=5, learn_rate=0.1, seed=1
        """
        # Remove estimator_class from kwargs if present (happens during sklearn clone).
        kwargs.pop("estimator_class", None)
        self.parameter_space_size = parameter_space_size
        if parameter_space_size not in PARAM_SPACE_GRID:
            raise ValueError(
                f"Invalid parameter_space_size: '{parameter_space_size}'. "
                f"Must be one of {list(PARAM_SPACE_GRID.keys())}"
            )

        if global_parameters.bayessearch:
            # For Bayesian search, the parameter space is a single dictionary.
            self.parameter_space = PARAM_SPACE_BAYES[parameter_space_size]
        else:
            # For Grid search, the parameter space is a list of dictionaries.
            self.parameter_space = [PARAM_SPACE_GRID[parameter_space_size]]

        # Pass estimator_class as a keyword argument.
        super().__init__(estimator_class=H2OGradientBoostingEstimator, **kwargs)
    def _prepare_fit(self, X: pd.DataFrame, y: pd.Series):
        """Overrides the base _prepare_fit to add GBM-specific parameter validation."""
        # Call the base class's _prepare_fit to get the initial setup.
        train_h2o, x_vars, outcome_var, model_params = super()._prepare_fit(X, y)

        # H2O rejects a GBM whose min_rows exceeds what the training frame can
        # support, so cap it at half the number of training rows.
        n_samples = len(train_h2o)
        max_allowed_min_rows = max(1.0, n_samples / 2.0) if n_samples > 0 else 1.0
        current_min_rows = model_params.get("min_rows", 10.0)

        if current_min_rows > max_allowed_min_rows:
            self.logger.warning(
                f"Adjusting 'min_rows' from {current_min_rows} to "
                f"{max_allowed_min_rows} to prevent H2O error."
            )
            model_params["min_rows"] = max_allowed_min_rows

        return train_h2o, x_vars, outcome_var, model_params
    # The fit() method is now inherited from H2OBaseClassifier.
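
# Worked example of the min_rows guard in _prepare_fit (hypothetical numbers):
# with 12 training rows, max_allowed_min_rows = max(1.0, 12 / 2.0) = 6.0, so an
# H2O default min_rows of 10.0 would be lowered to 6.0 before training.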