Source code for ml_grid.model_classes.lightgbm_class

from typing import Optional, Union

import lightgbm as lgb
from sklearn.base import BaseEstimator, ClassifierMixin
import pandas as pd
import re
import numpy as np



[docs]
class LightGBMClassifier(BaseEstimator, ClassifierMixin):
    """A scikit-learn compatible wrapper for the LightGBM classifier.

    LightGBM rejects feature names containing special (JSON) characters and
    feature names that occur more than once. This wrapper rewrites the column
    names of the input frame into a safe, unique form before every call that
    hands data to the underlying ``lgb.LGBMClassifier`` (``fit``, ``predict``
    and ``score``), so the same names are used at training and inference time.
    The caller's DataFrame is never modified.
    """

    # Everything except ASCII letters, digits and underscores is stripped
    # from feature names.
    _INVALID_NAME_CHARS = re.compile(r"[^A-Za-z0-9_]+")

    def __init__(
        self,
        boosting_type: str = "gbdt",
        num_leaves: int = 31,
        learning_rate: float = 0.05,
        n_estimators: int = 100,
        objective: str = "multiclass",
        num_class: int = 1,
        metric: str = "multi_logloss",
        feature_fraction: float = 0.9,
        early_stopping_rounds: Optional[int] = None,
        verbosity: int = -1,
    ):
        """Initializes the LightGBMClassifier wrapper.

        Args:
            boosting_type (str): The type of boosting to use.
            num_leaves (int): Maximum number of leaves in one tree. Values
                of 1 or less are replaced by 2.
            learning_rate (float): Boosting learning rate.
            n_estimators (int): Number of boosting rounds.
            objective (str): The learning objective.
            num_class (int): The number of classes for multiclass classification.
            metric (str): The metric to be used for evaluation.
            feature_fraction (float): Fraction of features to be considered for each
                tree.
            early_stopping_rounds (Optional[int]): Stored on the estimator but
                currently not forwarded to the underlying model.
                Defaults to None.
            verbosity (int): Controls the level of LightGBM's verbosity.
        """
        if num_leaves <= 1:
            num_leaves = 2

        self.boosting_type = boosting_type
        self.num_leaves = num_leaves
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.objective = objective
        self.num_class = num_class
        self.metric = metric
        self.feature_fraction = feature_fraction
        self.early_stopping_rounds = early_stopping_rounds
        self.verbosity = verbosity

        # Populated by ``fit``; ``None`` means "not fitted yet".
        self.model: Optional[lgb.LGBMClassifier] = None
        self.classes_: Optional[np.ndarray] = None

    @classmethod
    def _sanitize_feature_names(cls, X):
        """Returns `X` with LightGBM-safe, unique column names.

        Each column label is converted to ``str`` and stripped of every
        character that is not an ASCII letter, digit or underscore. A cleaned
        name that is already in use gets the column position appended
        (``name_<position>``); if that is taken as well, underscores are
        appended until the name is unique. The mapping depends only on the
        column labels and their order, so applying it to the training frame
        and to a later frame with the same columns yields the same names.

        Args:
            X: Input samples. Objects without a ``columns`` attribute
                (e.g. NumPy arrays) are returned unchanged.

        Returns:
            `X` itself when no renaming is needed, otherwise a shallow copy of
            `X` carrying the sanitized column names.
        """
        columns = getattr(X, "columns", None)
        if columns is None:
            return X

        original = list(columns)
        taken = set()
        safe_names = []
        for position, label in enumerate(original):
            base = cls._INVALID_NAME_CHARS.sub("", str(label))
            candidate = base
            if candidate in taken:
                candidate = f"{base}_{position}"
                # The positional suffix can itself clash with an existing
                # column (e.g. "a", "a!", "a_1"); keep extending until free.
                while candidate in taken:
                    candidate += "_"
            taken.add(candidate)
            safe_names.append(candidate)

        if safe_names == original:
            return X

        # Assign positionally on a shallow copy: this leaves the caller's
        # frame untouched and, unlike a label-based rename, also separates
        # columns that shared the same original label.
        renamed = X.copy(deep=False)
        renamed.columns = safe_names
        return renamed

    def fit(
        self, X: pd.DataFrame, y: Union[pd.Series, np.ndarray]
    ) -> "LightGBMClassifier":
        """Fits the LightGBM model.

        This method sanitizes the feature names in `X` before fitting the
        underlying `lgb.LGBMClassifier`.

        Args:
            X (pd.DataFrame): The training input samples.
            y (Union[pd.Series, np.ndarray]): The target values.

        Returns:
            LightGBMClassifier: The fitted estimator.
        """
        # ``set_params`` assigns attributes directly and so skips the clamp
        # in ``__init__``; re-apply it here so the model never sees a value
        # of 1 or less.
        num_leaves = self.num_leaves if self.num_leaves > 1 else 2

        # NOTE: ``early_stopping_rounds`` is intentionally not passed on.
        self.model = lgb.LGBMClassifier(
            boosting_type=self.boosting_type,
            num_leaves=num_leaves,
            learning_rate=self.learning_rate,
            n_estimators=self.n_estimators,
            objective=self.objective,
            num_class=self.num_class,
            metric=self.metric,
            feature_fraction=self.feature_fraction,
            verbose=self.verbosity,
        )

        X = self._sanitize_feature_names(X)
        y = np.ravel(y)

        self.model.fit(X, y)
        # Record the labels for every objective, not only for "binary".
        self.classes_ = np.unique(y)
        return self

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Predicts class labels for samples in X.

        This method sanitizes the feature names in `X` to match those used
        during training.

        Args:
            X (pd.DataFrame): The input samples to predict.

        Raises:
            ValueError: If the model has not been fitted yet.

        Returns:
            np.ndarray: The predicted class labels.
        """
        if self.model is None:
            raise ValueError(
                "Model has not been fitted yet. Call 'fit' before 'predict'."
            )
        return self.model.predict(self._sanitize_feature_names(X))

    def score(self, X: pd.DataFrame, y: Union[pd.Series, np.ndarray]) -> float:
        """Returns the mean accuracy on the given test data and labels.

        The feature names in `X` are sanitized in the same way as in `fit`
        and `predict` before the data is handed to the underlying model.

        Args:
            X (pd.DataFrame): Test samples.
            y (Union[pd.Series, np.ndarray]): True labels for X.

        Raises:
            ValueError: If the model has not been fitted yet.

        Returns:
            float: Mean accuracy of self.predict(X) wrt. y.
        """
        if self.model is None:
            raise ValueError(
                "Model has not been fitted yet. Call 'fit' before 'score'."
            )
        return self.model.score(self._sanitize_feature_names(X), y)