Source code for ml_grid.model_classes.H2OAutoMLClassifier

import logging

import pandas as pd
from h2o.automl import H2OAutoML
from h2o.estimators import H2OGeneralizedLinearEstimator

from .H2OBaseClassifier import H2OBaseClassifier

# Module-level logger named after this module's import path. The stray
# "[docs]" prefix that preceded this statement was link residue from the
# rendered documentation page and made the line invalid Python.
logger = logging.getLogger(__name__)
class H2OAutoMLClassifier(H2OBaseClassifier):
    """A scikit-learn compatible wrapper for H2O's AutoML.

    ``fit`` runs H2O AutoML and adopts the resulting leader as ``model_``.
    When the training frame is too small, or AutoML finishes without a
    leader, a plain binomial GLM is trained instead.

    ``predict()`` and ``predict_proba()`` are inherited from
    ``H2OBaseClassifier``; they rely on ``fit`` setting ``self.model_`` and
    ``self.model_id``.
    """

    def __init__(self, **kwargs):
        """Initializes the H2OAutoMLClassifier.

        Note:
            H2OAutoML is not a standard estimator, so a GLM is handed to the
            base class as a placeholder estimator and the AutoML process is
            managed inside ``fit``. This lets the class reuse the base class
            infrastructure.

        Args:
            **kwargs: Forwarded unchanged to ``H2OBaseClassifier.__init__``.
        """
        super().__init__(estimator_class=H2OGeneralizedLinearEstimator, **kwargs)
        # Holds the H2OAutoML object of the most recent AutoML run, or None
        # when no AutoML run backs the current model.
        self.automl = None

    def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> "H2OAutoMLClassifier":
        """Fits the H2O AutoML process.

        If the dataset is too small or AutoML finishes without a leader
        model, a simple GLM is trained as a fallback. Exceptions raised by
        ``H2OAutoML.train`` itself are not caught here and propagate.

        Args:
            X (pd.DataFrame): The feature matrix.
            y (pd.Series): The target vector.
            **kwargs: Additional keyword arguments (not used by this
                implementation).

        Returns:
            H2OAutoMLClassifier: The fitted classifier instance.

        Raises:
            RuntimeError: If no final model is available after fitting.
        """
        # --- 1. Standard validation and preparation ---
        # Base class validation; per the original notes it also handles
        # small-data errors.
        X, y = self._validate_input_data(X, y)

        # Forget any AutoML run from a previous fit so that ``self.automl``
        # never describes a different model than ``self.model_``.
        self.automl = None

        # Handle the small-data fallback before preparing H2O frames.
        if self._handle_small_data_fallback(X, y):
            return self._finalize_dummy_fit(X, y)

        train_h2o, x_vars, outcome_var, model_params = self._prepare_fit(X, y)

        # --- 2. AutoML specific checks and execution ---
        min_samples = 20  # A reasonable minimum for AutoML
        n_rows = len(train_h2o)
        n_features = len(x_vars)
        run_automl = n_rows >= min_samples and n_features >= 1

        if run_automl:
            self.logger.info("Dataset is large enough. Running H2O AutoML...")
            self.automl = H2OAutoML(**model_params)
            self.automl.train(y=outcome_var, x=x_vars, training_frame=train_h2o)

            # Read the leader once and reuse it; compare against None
            # explicitly instead of relying on the model object's truthiness.
            leader = self.automl.leader
            if leader is not None:
                self.logger.info(
                    f"AutoML found a leader model: {leader.model_id}"
                )
                # The best model found by AutoML becomes the main model.
                self.model_ = leader
            else:
                self.logger.warning(
                    "H2O AutoML finished but found no leader model. Falling back to a simple GLM."
                )
                run_automl = False  # Trigger the fallback logic below

        if not run_automl:
            self.logger.warning(
                f"Dataset too small for H2O AutoML or AutoML failed. "
                f"({n_rows} rows, {n_features} features). "
                f"Fitting a simple GLM as a fallback."
            )
            self._fit_fallback_glm(train_h2o, x_vars, outcome_var)
            self._using_dummy_model = True  # Set flag for reference
            # NOTE(review): this flag is never cleared on the AutoML success
            # path in this class - confirm the base class resets it.

        # --- 3. Finalize fit using base class standards ---
        if self.model_ is None:
            raise RuntimeError("H2OAutoMLClassifier failed to produce a final model.")

        # CRITICAL: store the model_id for persistence and retrieval; the
        # inherited predict/predict_proba depend on it.
        self.model_id = self.model_.model_id
        self.logger.info(
            f"✓✓✓ SUCCESS: H2OAutoMLClassifier is fitted. Final model_id: {self.model_id}"
        )
        return self

    def _fit_fallback_glm(self, train_h2o, x_vars, outcome_var) -> None:
        """Trains the simple binomial GLM fallback and stores it as ``model_``.

        Args:
            train_h2o: Training frame as returned by ``_prepare_fit``.
            x_vars: Predictor column names as returned by ``_prepare_fit``.
            outcome_var: Response column name as returned by ``_prepare_fit``.
        """
        # ``model_`` is assigned before training, matching the original
        # statement order in both call sites.
        self.model_ = H2OGeneralizedLinearEstimator(
            family="binomial", ignore_const_cols=False
        )
        self.model_.train(y=outcome_var, x=x_vars, training_frame=train_h2o)

    def _finalize_dummy_fit(
        self, X: pd.DataFrame, y: pd.Series
    ) -> "H2OAutoMLClassifier":
        """Finalizes the fitting process when a dummy model is used.

        Args:
            X (pd.DataFrame): The validated feature matrix.
            y (pd.Series): The validated target vector.

        Returns:
            H2OAutoMLClassifier: The fitted classifier instance.
        """
        self.logger.info("Finalizing fit for dummy GLM model.")

        # A minimal H2OFrame is still needed to train the fallback GLM on.
        train_h2o, x_vars, outcome_var, _ = self._prepare_fit(X, y)
        self._fit_fallback_glm(train_h2o, x_vars, outcome_var)

        # Set the model_id for predict() to work.
        self.model_id = self.model_.model_id
        self.logger.info(
            f"✓✓✓ SUCCESS: H2OAutoMLClassifier is fitted with a fallback model. Final model_id: {self.model_id}"
        )
        return self

    def shutdown(self):
        """Does nothing; kept so callers can invoke ``shutdown()`` safely.

        This method is intentionally a no-op and does not shut down the H2O
        cluster. Per the original author's note, cleanup is left to the base
        class ``__del__``, and an explicit cluster shutdown is better handled
        at the pipeline level.
        """
        pass