Source code for ml_grid.pipeline.data_constant_columns

import logging
from typing import List, Optional, Tuple, Union

import numpy as np
import pandas as pd


[docs] def remove_constant_columns( X: pd.DataFrame, drop_list: Optional[List[str]] = None, verbose: int = 1 ) -> List[str]: """Identifies columns in a DataFrame where all values are the same. Args: X (pd.DataFrame): DataFrame to check for constant columns. drop_list (Optional[List[str]], optional): A list of columns already marked for dropping. Defaults to None. verbose (int, optional): Controls the verbosity of logging. Defaults to 1. Returns: List[str]: Updated list of columns to drop, including constant columns. Raises: AssertionError: If X is None. """ logger = logging.getLogger("ml_grid") try: if verbose > 1: # verbose is passed but not used well here. logger.info("Identifying constant columns") assert X is not None, "Null pointer exception: X cannot be None." # Initialize drop_list if not provided if drop_list is None: drop_list = [] # Identify constant columns constant_columns = [col for col in X.columns if X[col].nunique() == 1] if constant_columns: if verbose > 1: logger.info(f"Constant columns identified: {constant_columns}") # Add constant columns to drop_list drop_list.extend(constant_columns) except AssertionError as e: logger.error(str(e)) raise except Exception as e: logger.error(f"Unhandled exception: {e}", exc_info=True) raise return drop_list
[docs] def remove_constant_columns_with_debug( X_train: Union[pd.DataFrame, np.ndarray], X_test: Union[pd.DataFrame, np.ndarray], X_test_orig: Union[pd.DataFrame, np.ndarray], verbosity: int = 2, ) -> Tuple[ Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], Union[pd.DataFrame, np.ndarray], ]: """Removes constant columns from training and testing datasets. This function identifies columns that have zero variance in the training set and removes them from all provided datasets (X_train, X_test, X_test_orig). It supports both pandas DataFrames and NumPy arrays, including 3D arrays for time series data. IMPORTANT: Only checks X_train for constant columns to prevent data leakage. A column is constant if it has <= 1 unique value in X_train. Args: X_train (Union[pd.DataFrame, np.ndarray]): Training feature data. X_test (Union[pd.DataFrame, np.ndarray]): Testing feature data. X_test_orig (Union[pd.DataFrame, np.ndarray]): Original (unsplit) testing feature data. verbosity (int, optional): Controls the verbosity of debug messages. Defaults to 2. Returns: Tuple[Union[pd.DataFrame, np.ndarray], ...]: A tuple containing the modified X_train, X_test, and X_test_orig datasets with constant columns removed. """ logger = logging.getLogger("ml_grid") if verbosity > 0: # Debug message: Initial shapes of X_train, X_test, X_test_orig logger.debug(f"Initial X_train shape: {X_train.shape}") logger.debug(f"Initial X_test shape: {X_test.shape}") logger.debug(f"Initial X_test_orig shape: {X_test_orig.shape}") is_pandas = isinstance(X_train, pd.DataFrame) if is_pandas: # Identify constant columns in X_train # A column is constant if it has 1 or fewer unique values (excluding NaN) constant_columns = [] for col in X_train.columns: try: # Use nunique() without dropna to be more conservative # A column with all NaN is also constant and should be dropped n_unique = X_train[col].nunique(dropna=False) if n_unique <= 1: constant_columns.append(col) if verbosity > 1: logger.debug( f"Column '{col}' is constant: nunique={n_unique}, sample values: {X_train[col].head()}" ) # Additional check for numeric columns with zero variance elif pd.api.types.is_numeric_dtype(X_train[col]): try: col_std = X_train[col].std() if col_std == 0 or ( pd.notna(col_std) and np.isclose(col_std, 0) ): constant_columns.append(col) if verbosity > 1: logger.debug( f"Column '{col}' has zero variance: std={col_std}" ) except Exception as e: if verbosity > 1: logger.warning( f"Could not calculate std for column '{col}': {e}" ) except Exception as e: if verbosity > 1: logger.warning(f"Error checking column '{col}': {e}") if verbosity > 1: logger.debug("\nUnique value counts in X_train:") for col in X_train.columns: logger.debug( f" {col}: {X_train[col].nunique(dropna=False)} unique values" ) if verbosity > 0: logger.info(f"\nConstant columns identified in X_train: {constant_columns}") logger.info( f"Number of constant columns to remove: {len(constant_columns)}" ) if constant_columns: X_train = X_train.drop(columns=constant_columns, errors="ignore") X_test = X_test.drop(columns=constant_columns, errors="ignore") X_test_orig = X_test_orig.drop(columns=constant_columns, errors="ignore") else: # Handle numpy arrays # Determine variance calculation axis based on dimensions if X_train.ndim == 3: # For 3D time series data (e.g., from aeon: samples, features, timesteps), # calculate variance for each feature across samples and timesteps. var_axis = (0, 2) else: # For 2D data, calculate variance across samples (axis 0). var_axis = 0 train_variances = X_train.var(axis=var_axis) constant_indices_train = np.where(train_variances == 0)[0] if verbosity > 0: logger.info( f"Constant feature indices in X_train: {list(constant_indices_train)}" ) # A feature is constant if it has no variance in the training set. # We should not consider the test set variance, as a small test set # might misleadingly have constant features. constant_indices = constant_indices_train # Create a boolean mask for features to keep num_features = X_train.shape[1] keep_mask = np.ones(num_features, dtype=bool) keep_mask[constant_indices] = False # Apply the mask to remove constant features if X_train.ndim == 3: X_train = X_train[:, keep_mask, :] X_test = X_test[:, keep_mask, :] X_test_orig = X_test_orig[:, keep_mask, :] else: # 2D array X_train = X_train[:, keep_mask] X_test = X_test[:, keep_mask] X_test_orig = X_test_orig[:, keep_mask] if verbosity > 0: # Debug message: Shape after removing constant columns from X_train, X_test, X_test_orig logger.debug( f"Shape of X_train after removing constant columns: {X_train.shape}" ) logger.debug(f"Shape of X_test after removing constant columns: {X_test.shape}") logger.debug( f"Shape of X_test_orig after removing constant columns: {X_test_orig.shape}" ) # Return the modified X_train, X_test, and X_test_orig, with y_test_orig unchanged return X_train, X_test, X_test_orig
# Example usage with verbosity level 2 (most verbose) # X_train, X_test, X_test_orig = remove_constant_columns_with_debug(X_train, X_test, X_test_orig, verbosity=2)