# Source code for ml_grid.model_classes.H2ODeepLearningClassifier

import pandas as pd
from h2o.estimators import H2ODeepLearningEstimator
from skopt.space import Categorical, Integer, Real

from ml_grid.util.global_params import global_parameters

from .H2OBaseClassifier import H2OBaseClassifier


# Candidate values for exhaustive grid search, keyed by search-space size.
# Each inner dict maps a hyperparameter name to the list of values to try.
# "hidden_config" is a wrapper-only name; H2ODeepLearningClassifier._prepare_fit
# resolves it to a concrete "hidden" layer list and strips it before training.
PARAM_SPACE_GRID = {
    # Single-point space: exactly one combination.
    "xsmall": {
        "epochs": [5],
        "hidden_config": ["small"],
        "activation": ["Rectifier"],
        "l1": [0],
        "l2": [0],
        "seed": [1],
    },
    # Two candidates per hyperparameter.
    "small": {
        "epochs": [5, 10],
        "hidden_config": ["small", "medium"],
        "activation": ["Rectifier", "Tanh"],
        "l1": [0, 1e-4],
        "l2": [0, 1e-4],
        "seed": [1, 42],
    },
    # Three candidates per hyperparameter.
    "medium": {
        "epochs": [10, 50, 100],
        "hidden_config": ["small", "medium", "large"],
        "activation": ["Rectifier", "Tanh", "Maxout"],
        "l1": [0, 1e-4, 1e-3],
        "l2": [0, 1e-4, 1e-3],
        "seed": [1, 42, 123],
    },
}



# Search dimensions for Bayesian optimisation, keyed by search-space size.
# Each inner dict maps a hyperparameter name to a skopt dimension object.
# The l1/l2 ranges are sampled with a "log-uniform" prior, so their lower
# bounds are small positive numbers rather than 0.
# "hidden_config" is a wrapper-only name; H2ODeepLearningClassifier._prepare_fit
# resolves it to a concrete "hidden" layer list and strips it before training.
PARAM_SPACE_BAYES = {
    "xsmall": {
        "epochs": Integer(5, 10),
        "hidden_config": Categorical(["small"]),
        "activation": Categorical(["Rectifier"]),
        "l1": Real(1e-5, 1e-4, "log-uniform"),
        "l2": Real(1e-5, 1e-4, "log-uniform"),
        "seed": Integer(1, 100),
    },
    "small": {
        "epochs": Integer(5, 20),
        "hidden_config": Categorical(["small", "medium", "large"]),
        "activation": Categorical(["Rectifier", "Tanh"]),
        "l1": Real(1e-5, 1e-3, "log-uniform"),
        "l2": Real(1e-5, 1e-3, "log-uniform"),
        "seed": Integer(1, 1000),
    },
    "medium": {
        "epochs": Integer(10, 200),
        "hidden_config": Categorical(["small", "medium", "large"]),
        "activation": Categorical(["Rectifier", "Tanh", "Maxout"]),
        "l1": Real(1e-6, 1e-2, "log-uniform"),
        "l2": Real(1e-6, 1e-2, "log-uniform"),
        "seed": Integer(1, 2000),
    },
}




class H2ODeepLearningClassifier(H2OBaseClassifier):
    """A scikit-learn compatible wrapper for H2O's Deep Learning models.

    On top of the base wrapper this class adds two things:

    * it picks a hyperparameter search space (``self.parameter_space``) from
      the module-level ``PARAM_SPACE_GRID`` / ``PARAM_SPACE_BAYES`` tables;
    * it resolves the wrapper-only ``hidden_config`` name into a concrete
      ``hidden`` layer list just before the H2O model is built.
    """

    def __init__(
        self, hidden=None, hidden_config=None, parameter_space_size="small", **kwargs
    ):
        """Initializes the H2ODeepLearningClassifier.

        Hidden layers can be given directly via ``hidden`` or indirectly via a
        predefined configuration name ``hidden_config``.

        Args:
            hidden (list, optional): A list of integers specifying the number of
                neurons for each hidden layer. Defaults to None.
            hidden_config (str, optional): A string key ('small', 'medium', 'large')
                to select a predefined hidden layer architecture. Defaults to None.
            parameter_space_size (str, optional): Key of the predefined search
                space to expose as ``self.parameter_space`` ('xsmall', 'small'
                or 'medium'). Defaults to "small".
            **kwargs: Additional keyword arguments passed to the H2ODeepLearningEstimator.

        Raises:
            ValueError: If ``parameter_space_size`` is not a key of the search
                space table in use.
        """
        # Set these as instance attributes for scikit-learn compatibility
        self.hidden = hidden
        self.hidden_config = hidden_config
        self.parameter_space_size = parameter_space_size

        # Remove estimator_class from kwargs if present (happens during sklearn clone)
        kwargs.pop("estimator_class", None)

        # Add our specific parameters to kwargs to be handled by the base class
        kwargs["hidden"] = self.hidden
        kwargs["hidden_config"] = self.hidden_config

        # Validate against the table that will actually be indexed, so a size
        # missing from it surfaces as the ValueError below, not a bare KeyError.
        use_bayes = global_parameters.bayessearch
        search_spaces = PARAM_SPACE_BAYES if use_bayes else PARAM_SPACE_GRID
        if parameter_space_size not in search_spaces:
            raise ValueError(
                f"Invalid parameter_space_size: '{parameter_space_size}'. Must be one of {list(search_spaces.keys())}"
            )

        if use_bayes:
            # For Bayesian search, the parameter space is a single dictionary
            self.parameter_space = search_spaces[parameter_space_size]
        else:
            # For Grid search, the parameter space is a list of dictionaries
            self.parameter_space = [search_spaces[parameter_space_size]]

        # Pass all parameters to the super constructor
        super().__init__(estimator_class=H2ODeepLearningEstimator, **kwargs)

    def _prepare_fit(self, X: pd.DataFrame, y: pd.Series):
        """Resolve the hidden layer configuration before the model is instantiated.

        Delegates to the base class ``_prepare_fit`` and then post-processes
        the returned model parameters: if ``hidden`` is unset, it is filled in
        from ``hidden_config`` (defaulting to "medium"), and the wrapper-only
        ``hidden_config`` key is always removed.

        Args:
            X (pd.DataFrame): Training features, forwarded to the base class.
            y (pd.Series): Training target, forwarded to the base class.

        Returns:
            The four values produced by the base class ``_prepare_fit``
            (``train_h2o, x_vars, outcome_var, model_params``), with
            ``model_params`` adjusted as described above.
        """
        # Call the base class's _prepare_fit to get the initial setup
        train_h2o, x_vars, outcome_var, model_params = super()._prepare_fit(X, y)

        # --- Deep Learning Specific Logic ---
        # If 'hidden' is not explicitly provided, use 'hidden_config' to set it.
        # We modify the model_params dictionary, not self.hidden.
        if model_params.get("hidden") is None:
            config_name = model_params.get("hidden_config") or "medium"

            hidden_layer_configs = {
                "small": [10, 10],
                "medium": [50, 50],
                "large": [100, 100, 100],
            }

            # An unrecognised name still falls back to the medium layout, but
            # say so explicitly instead of logging it as a normal resolution.
            if config_name not in hidden_layer_configs:
                self.logger.warning(
                    f"Unknown hidden_config '{config_name}'; falling back to [50, 50]"
                )

            resolved_hidden = hidden_layer_configs.get(config_name, [50, 50])
            model_params["hidden"] = resolved_hidden
            self.logger.debug(
                f"Resolved hidden layers from config '{config_name}' to {resolved_hidden}"
            )

        # Remove the wrapper-only 'hidden_config' parameter before training
        model_params.pop("hidden_config", None)

        return train_h2o, x_vars, outcome_var, model_params

    # The fit() method is inherited from H2OBaseClassifier and will use the
    # parameters returned by our overridden _prepare_fit().