Source code for ml_grid.model_classes.H2OStackedEnsembleClassifier

import logging
from typing import List

import pandas as pd

# Import H2O estimators
try:
    from h2o.estimators import H2OStackedEnsembleEstimator
except ImportError:
    logging.getLogger(__name__).warning(
        "H2OStackedEnsembleEstimator could not be imported. "
        "H2OStackedEnsembleClassifier will not be available."
    )

    # Define a lightweight stub so this module can still be imported when
    # h2o is not installed; the classifier itself will remain unusable.
    class H2OStackedEnsembleEstimator: ...

# Import the base class
from .H2OBaseClassifier import H2OBaseClassifier

# Configure logging
logger = logging.getLogger(__name__)

class H2OStackedEnsembleClassifier(H2OBaseClassifier):
    """A scikit-learn compatible wrapper for the H2O Stacked Ensemble classifier.

    This class adheres to the scikit-learn API (fit, predict, predict_proba)
    by inheriting from H2OBaseClassifier and uses H2OStackedEnsembleEstimator
    as its underlying model.

    This wrapper is designed to be used within the ml_grid pipeline, but has
    known limitations with scikit-learn's cross-validation tools (such as
    GridSearchCV) due to H2O's management of base model CV predictions.

    The `fit` method is overridden to handle the specific requirements of a
    stacked ensemble, namely the `base_models` parameter. The `predict` and
    `predict_proba` methods are inherited from H2OBaseClassifier. A usage
    sketch is included at the end of this module.
    """

    def __init__(self, base_models: List[H2OBaseClassifier] = None, **kwargs):
        """Initializes the H2OStackedEnsembleClassifier.

        Args:
            base_models (List[H2OBaseClassifier], optional): A list of
                *unfitted* H2O classifier wrapper instances that will be
                trained as base learners. These models *must* be trained with
                `nfolds > 1` and `keep_cross_validation_predictions=True`.
            **kwargs: Keyword arguments passed to the
                H2OStackedEnsembleEstimator. Common arguments include
                `metalearner_algorithm`, `seed`, etc.
        """
        # Pass base_models along with other kwargs to the parent constructor.
        # This ensures it is treated as a standard sklearn parameter.
        kwargs["base_models"] = base_models if base_models is not None else []
        kwargs["estimator_class"] = H2OStackedEnsembleEstimator
        super().__init__(**kwargs)

    def set_params(self, **kwargs):
        """Overrides set_params to correctly handle the `base_models` list.

        This is critical for scikit-learn's `clone` function to work
        correctly during cross-validation.

        Returns:
            H2OStackedEnsembleClassifier: The instance with updated parameters.
        """
        super().set_params(**kwargs)
        return self

    def get_params(self, deep: bool = True) -> dict:
        """Overrides get_params to ensure `base_models` is included.

        This allows scikit-learn's `clone` function to work correctly.

        Returns:
            dict: A dictionary of the estimator's parameters.
        """
        return super().get_params(deep=deep)

    def score(self, X: pd.DataFrame, y: pd.Series, sample_weight=None) -> float:
        """Returns the mean accuracy on the given test data and labels.

        This method is required for scikit-learn compatibility, especially for
        use with tools like GridSearchCV when no `scoring` is specified.

        Args:
            X (pd.DataFrame): Test samples.
            y (pd.Series): True labels for X.
            sample_weight: Sample weights (ignored, for API compatibility).

        Returns:
            float: The mean accuracy of the model.
        """
        from sklearn.metrics import accuracy_score

        return accuracy_score(y, self.predict(X))

    def fit(
        self, X: pd.DataFrame, y: pd.Series, **kwargs
    ) -> "H2OStackedEnsembleClassifier":
        """Fits the H2O Stacked Ensemble model, making it compatible with
        scikit-learn's CV tools.

        This method encapsulates the entire two-stage fitting process:

        1. It first fits each of the base models on the provided training
           data, ensuring they are trained with cross-validation to generate
           predictions for the metalearner.
        2. It then collects the model IDs of the fitted base models.
        3. Finally, it trains the metalearner (the stacked ensemble model)
           using these base models.

        Args:
            X (pd.DataFrame): The feature matrix.
            y (pd.Series): The target vector.
            **kwargs: Additional keyword arguments (not used).

        Returns:
            H2OStackedEnsembleClassifier: The fitted classifier instance.

        Raises:
            ValueError: If `base_models` is empty or not provided.
        """
        try:
            # 1. Initial validation
            # --- CRITICAL FIX: Call super's validation methods ---
            # These handle the common validation steps (NaN/Inf checks, etc.)
            # and the small-data fallback, which were previously omitted here.
            X, y = self._validate_input_data(X, y)
            if self._handle_small_data_fallback(X, y):
                return self

            if not self.base_models:
                raise ValueError(
                    "`base_models` parameter is empty. "
                    "H2OStackedEnsembleClassifier requires a "
                    "list of base model estimators."
                )

            # 2. Fit each base model
            self.logger.info(
                f"Fitting {len(self.base_models)} base models for StackedEnsemble..."
            )
            base_models_list = []
            for i, model_wrapper in enumerate(
                self.base_models
            ):  # type: H2OBaseClassifier
                self.logger.debug(
                    f"Fitting base model {i + 1}: {type(model_wrapper).__name__}"
                )
                model_wrapper.set_params(
                    nfolds=5,  # A reasonable default for base model CV
                    keep_cross_validation_predictions=True,
                    fold_assignment="Modulo",
                )
                # CRITICAL FIX: Explicitly call the fit method from
                # H2OBaseClassifier to ensure correct data handling
                # (pandas -> H2OFrame) and avoid the '' error from the
                # native H2O fit method.
                H2OBaseClassifier.fit(model_wrapper, X, y)
                base_models_list.append(model_wrapper.model_id)
            self.logger.info("All base models fitted.")

            # 3. Fit the metalearner (the ensemble itself)
            # The parent _prepare_fit handles data conversion and parameter
            # extraction.
            train_h2o, x_vars, outcome_var, model_params = self._prepare_fit(X, y)

            self.model_ = H2OStackedEnsembleEstimator(
                base_models=base_models_list, **model_params
            )
            self.model_.train(x=x_vars, y=outcome_var, training_frame=train_h2o)

            # 4. Store fitted attributes for sklearn compatibility
            # (the remaining attributes are set by the parent's _prepare_fit call)
            self.model_id = self.model_.model_id

            self.logger.debug(f"Successfully fitted {self.estimator_class.__name__}")

        except Exception as e:
            self.logger.critical(
                "A critical, unrecoverable error occurred during "
                f"H2OStackedEnsemble fit: {e}",
                exc_info=True,
            )
            raise e

        return self
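
# --- Usage sketch (illustrative only) ---
# A minimal sketch of how this wrapper is intended to be used, assuming a
# running H2O cluster and ml_grid-style base-model wrappers built on
# H2OBaseClassifier. The wrapper import paths (`H2OGBMClassifier`,
# `H2OGLMClassifier`), the synthetic data, and the metalearner settings are
# illustrative assumptions, not part of this module's API.
if __name__ == "__main__":
    import h2o
    import numpy as np

    # Hypothetical base-model wrappers; actual class names/paths may differ.
    from ml_grid.model_classes.H2OGBMClassifier import H2OGBMClassifier
    from ml_grid.model_classes.H2OGLMClassifier import H2OGLMClassifier

    h2o.init()  # the base class may also manage the cluster; shown for clarity

    # Small synthetic binary-classification dataset.
    rng = np.random.default_rng(42)
    X = pd.DataFrame(
        rng.normal(size=(200, 5)), columns=[f"f{i}" for i in range(5)]
    )
    y = pd.Series((X["f0"] + X["f1"] > 0).astype(int), name="outcome")

    ensemble = H2OStackedEnsembleClassifier(
        base_models=[H2OGBMClassifier(seed=1), H2OGLMClassifier(seed=1)],
        metalearner_algorithm="glm",
        seed=1,
    )

    # scikit-learn's clone relies on the get_params/set_params overrides above
    # to carry `base_models` into the fresh, unfitted copy.
    from sklearn.base import clone

    fresh_copy = clone(ensemble)

    # `fit` trains each base model with cross-validation, then the metalearner.
    ensemble.fit(X, y)
    print("Training accuracy:", ensemble.score(X, y))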