Source code for ml_grid.model_classes.H2OAutoMLClassifier

from typing import Any, Dict, Optional

import h2o
from h2o.automl import H2OAutoML
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_is_fitted


[docs] class H2OAutoMLClassifier(BaseEstimator, ClassifierMixin): """A scikit-learn compatible wrapper for H2O's AutoML. This class allows H2O's AutoML to be used as a standard scikit-learn classifier, making it compatible with tools like GridSearchCV and BayesSearchCV. """ def __init__( self, max_runtime_secs: int = 360, nfolds: int = 2, seed: int = 1 ): """Initializes the H2OAutoMLClassifier. Args: max_runtime_secs (int): Maximum time in seconds to run the AutoML process. nfolds (int): Number of folds for cross-validation. seed (int): Random seed for reproducibility. """
[docs] self.max_runtime_secs = max_runtime_secs
[docs] self.nfolds = nfolds
[docs] self.seed = seed
[docs] self.automl: Optional[H2OAutoML] = None
[docs] self.classes_: Optional[np.ndarray] = None
[docs] def fit(self, X: pd.DataFrame, y: pd.Series) -> "H2OAutoMLClassifier": """Fits the H2O AutoML model. This method initializes an H2O cluster, converts the pandas DataFrame and Series to H2O Frames, and then trains the AutoML model. Args: X (pd.DataFrame): The training input samples. y (pd.Series): The target values. Returns: H2OAutoMLClassifier: The fitted estimator. """ self.classes_ = np.unique(y) try: outcome_var = y.columns[0] except: outcome_var = y.name x = list(X.columns) y_n = outcome_var try: x.remove(y_n) except: pass h2o.init() train_df = pd.concat([X, y], axis=1) train_h2o = h2o.H2OFrame(train_df) train_h2o[y_n] = train_h2o[y_n].asfactor() self.automl = H2OAutoML( max_runtime_secs=self.max_runtime_secs, max_models=5, nfolds=self.nfolds, seed=self.seed, ) self.automl.train(y=y_n, x=x, training_frame=train_h2o) return self
[docs] def predict(self, X: pd.DataFrame) -> np.ndarray: """Predicts class labels for samples in X. Args: X (pd.DataFrame): The input samples to predict. Returns: np.ndarray: The predicted class labels. """ check_is_fitted(self) test_h2o = h2o.H2OFrame(X) predictions = self.automl.leader.predict(test_h2o) return predictions["predict"].as_data_frame().values
[docs] def predict_proba(self, X: pd.DataFrame) -> np.ndarray: """Predicts class probabilities for samples in X. Note: This method is not implemented for H2O AutoML. Args: X (pd.DataFrame): The input samples. Raises: NotImplementedError: H2O AutoML does not support predict_proba. """ raise NotImplementedError("H2O AutoML does not support predict_proba.")
[docs] def get_params(self, deep: bool = True) -> Dict[str, Any]: """Gets parameters for this estimator. Args: deep (bool): If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: Dict[str, Any]: Parameter names mapped to their values. """ return { "max_runtime_secs": self.max_runtime_secs, "nfolds": self.nfolds, "seed": self.seed, }
[docs] def set_params(self, **params: Any) -> "H2OAutoMLClassifier": """Sets the parameters of this estimator. Args: **params (Any): Estimator parameters. Returns: H2OAutoMLClassifier: The instance with updated parameters. """ for param, value in params.items(): setattr(self, param, value) return self
[docs] def get_leader_params(self) -> Dict[str, Any]: """Gets the parameters of the best model found by AutoML. Returns: Dict[str, Any]: A dictionary of the leader model's parameters. """ check_is_fitted(self) return self.automl.leader.params