Source code for ml_grid.pipeline.data_feature_methods

from typing import List, Union

import numpy as np
import pandas as pd
from PyImpetus import PPIMBC
from sklearn.feature_selection import f_classif
from sklearn.svm import SVC



[docs]
class feature_methods:
    def __init__(self) -> None:
        """Initializes the feature_methods class."""
        pass


[docs]
    def getNfeaturesANOVAF(
        self, n: int, X_train: Union[pd.DataFrame, np.ndarray], y_train: pd.Series
    ) -> List[str]:
        """Gets the top n features based on the ANOVA F-value.

        This method is for classification problems. The ANOVA F-value is
        calculated for each feature in X_train, and the resulting F-values are
        sorted in descending order. The top n features with the highest F-values
        are returned.

        Args:
            n (int): The number of top features to return.
            X_train (Union[pd.DataFrame, np.ndarray]): Training data.
            y_train (pd.Series): Target variable.

        Raises:
            ValueError: If X_train is not a pandas DataFrame or numpy array, or
                if no features can be returned (e.g., all have NaN F-values).

        Returns
            List[str]: A list of column names for the top n features.
        """

        # Check if input is a pandas DataFrame or numpy array
        if isinstance(X_train, pd.DataFrame):
            feature_names = X_train.columns  # Get column names
            X_train = X_train.values  # Convert to numpy array
        elif isinstance(X_train, np.ndarray):
            feature_names = np.arange(
                X_train.shape[1]
            )  # Use indices as column names
        else:
            raise ValueError("X_train must be a pandas DataFrame or numpy array")

        # Calculate F-values for all features at once
        f_values, _ = f_classif(X_train, y_train)

        # Create a list of (feature_name, f_value) tuples, ignoring NaNs
        res = [
            (feature_names[i], f_values[i])
            for i in range(len(feature_names))
            if not np.isnan(f_values[i])
        ]

        # Sort the list based on F-value in descending order
        sortedList = sorted(res, key=lambda x: x[1], reverse=True)

        # Return column names of top n features
        nFeatures = sortedList[:n]  # Get top n features
        finalColNames = [elem[0] for elem in nFeatures]

        # Add a check to ensure that at least one feature is returned.
        # If not, it means all features were filtered out (e.g., all had NaN F-values),
        # which would lead to an empty X_train and cause pipeline failure.
        if not finalColNames:
            # Fallback: if all features were filtered, return the single best one that is not NaN.
            # This can happen if n is too small or all f-values are NaN.
            if sortedList:
                return [sortedList[0][0]]
            else:
                raise ValueError(
                    "getNfeaturesANOVAF returned no features. All features might have NaN F-values."
                )

        return finalColNames





[docs]
    def getNFeaturesMarkovBlanket(
        self,
        n: int,
        X_train: pd.DataFrame,
        y_train: pd.Series,
        num_simul: int = 30,
        cv: int = 5,
        svc_kernel: str = "rbf",
    ) -> List[str]:
        """Gets the top n features from the Markov Blanket (MB) using PyImpetus.

        Args:
            n (int): The number of top features to retrieve.
            X_train (pd.DataFrame): The training input samples.
            y_train (pd.Series): The target values.
            num_simul (int): Number of simulations for stability selection in
                PyImpetus. Defaults to 30.
            cv (int): Number of cross-validation folds. Defaults to 5.
            svc_kernel (str): The kernel to be used by the SVC model.
                Defaults to "rbf".

        Raises:
            TypeError: If X_train is not a pandas DataFrame.

        Returns:
            List[str]: A list containing the names of the top n features from
            the Markov Blanket.
        """
        # Ensure input is a pandas DataFrame to access column names
        if not isinstance(X_train, pd.DataFrame):
            raise TypeError(
                "X_train must be a pandas DataFrame for getNFeaturesMarkovBlanket."
            )
        original_columns = X_train.columns

        # Ensure y_train is a pandas Series, as expected by PyImpetus internally
        if not isinstance(y_train, pd.Series):
            y_train = pd.Series(y_train)
        
        # Initialize the PyImpetus object with desired parameters
        model = PPIMBC(model=SVC(random_state=27, class_weight="balanced", kernel=svc_kernel),
                    p_val_thresh=0.05, 
                    num_simul=num_simul,
                    simul_size=0.2, 
                    simul_type=0, 
                    sig_test_type="non-parametric", 
                    cv=cv,
                    random_state=27, 
                    n_jobs=-1, 
                    verbose=2)
        
        # Fit and transform the training data
        # PyImpetus works with numpy arrays and returns feature indices in model.MB
        model.fit(X_train.values, y_train)
        
        # Get the feature indices from the Markov blanket (MB)
        selected_features = model.MB

        # PyImpetus can return column names (str) or indices (int).
        # We need to handle both cases to get the final list of feature names.
        if all(isinstance(f, int) for f in selected_features):
            # It returned indices, so map them to names
            feature_names = [original_columns[i] for i in selected_features][:n]
        else:
            # It returned names directly
            feature_names = list(selected_features)[:n]

        # Fallback: If feature selection returns an empty list, but the model found features,
        # return the single most important one. This prevents pipeline failure.
        if not feature_names and selected_features:
            # Re-evaluate the first selected feature to ensure it's a valid name
            first_feature = selected_features[0]
            feature_names = [original_columns[first_feature] if isinstance(first_feature, int) else first_feature]
        
        return feature_names