Source code for ml_grid.model_classes.svc_class

import logging
from typing import Any, Dict, List, Optional

import pandas as pd
from scipy import sparse
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from skopt.space import Categorical, Integer, Real

from ml_grid.util import param_space
from ml_grid.util.global_params import global_parameters

logging.getLogger("ml_grid").debug("Imported SVC class")


class SVCClass:
    """SVC with support for Bayesian and traditional grid search parameter spaces."""

    def __init__(
        self,
        X: Optional[pd.DataFrame] = None,
        y: Optional[pd.Series] = None,
        parameter_space_size: Optional[str] = None,
    ):
        """Initializes the SVCClass.

        This class requires scaled data. If the input data `X` is not detected
        as scaled, it will be automatically scaled using `StandardScaler`.

        Args:
            X (Optional[pd.DataFrame]): Feature matrix for training. Defaults to None.
            y (Optional[pd.Series]): Target vector for training. Defaults to None.
            parameter_space_size (Optional[str]): Size of the parameter space
                for optimization. Defaults to None.

        Raises:
            ValueError: If `parameter_space_size` is not a valid key.
            RuntimeError: If data scaling fails.
        """
        self.X: Optional[pd.DataFrame] = X
        self.y: Optional[pd.Series] = y
        self.scaler: Optional[StandardScaler] = None
        # Enforce scaling for the SVM method
        if not self.is_data_scaled():
            try:
                # Data validation checks before scaling
                if self.X is None:
                    raise ValueError(
                        "Input data X is None - data not loaded properly"
                    )

                # If the DataFrame is empty there is nothing to scale; fail fast
                # with a clear message rather than letting fit_transform raise.
                if isinstance(self.X, pd.DataFrame) and self.X.empty:
                    raise ValueError(
                        "SVC_class received an empty DataFrame. Halting execution."
                    )

                if self.scaler is None:
                    self.scaler = StandardScaler()

                # Convert sparse matrices if needed
                if sparse.issparse(self.X):
                    self.X = self.X.toarray()

                # Ensure numeric data
                if isinstance(self.X, pd.DataFrame):
                    non_numeric = self.X.select_dtypes(exclude=["number"]).columns
                    if len(non_numeric) > 0:
                        raise ValueError(
                            f"Non-numeric columns found: {list(non_numeric)}"
                        )

                # Perform scaling, preserving column names and index where available
                self.X = pd.DataFrame(
                    self.scaler.fit_transform(self.X),
                    columns=self.X.columns if hasattr(self.X, "columns") else None,
                    index=self.X.index if hasattr(self.X, "index") else None,
                )
                logging.getLogger("ml_grid").info(
                    "Data scaling completed successfully for SVC"
                )
            except Exception as e:
                error_msg = f"Data scaling failed: {str(e)}"
                logging.getLogger("ml_grid").error(error_msg)
                # Additional debug info
                if self.X is not None:
                    logging.getLogger("ml_grid").debug(f"Data type: {type(self.X)}")
                    if hasattr(self.X, "shape"):
                        logging.getLogger("ml_grid").debug(f"Shape: {self.X.shape}")
                    if isinstance(self.X, pd.DataFrame):
                        logging.getLogger("ml_grid").debug(
                            f"Columns: {self.X.columns.tolist()}"
                        )
                        logging.getLogger("ml_grid").debug(
                            f"Data types:\n{self.X.dtypes}"
                        )
                raise RuntimeError(error_msg) from e
        self.algorithm_implementation: SVC = SVC()
        self.method_name: str = "SVC"

        # Define the parameter vector space
        self.parameter_vector_space: param_space.ParamSpace = param_space.ParamSpace(
            parameter_space_size
        )
        self.parameter_space: List[Dict[str, Any]]
        if global_parameters.bayessearch:
            # Bayesian Optimization: define the parameter space using skopt
            # dimensions. Two dictionaries are needed because sklearn's SVC
            # only allows break_ties=True with decision_function_shape="ovr".
            self.parameter_space = [
                {
                    "C": Real(1e-5, 1e-2, prior="log-uniform"),
                    "break_ties": Categorical([False]),
                    # 'cache_size': self.parameter_vector_space.param_dict.get("log_large"),  # Uncomment if needed
                    # 'class_weight': self.parameter_vector_space.param_dict.get("enum_class_weights"),  # Example for enumerating class weights
                    "coef0": Real(1e-5, 1e-2, prior="log-uniform"),
                    "decision_function_shape": Categorical(["ovo"]),
                    "degree": Integer(2, 5),
                    "gamma": Categorical(["scale", "auto"]),
                    "kernel": Categorical(["rbf", "linear", "poly", "sigmoid"]),
                    "max_iter": Integer(100, 1000),
                    # 'probability': Categorical([True, False]),  # Uncomment if needed
                    # 'random_state': Categorical([None]),  # Example for random state
                    "shrinking": Categorical([True, False]),
                    "tol": Real(1e-5, 1e-2, prior="log-uniform"),
                    "verbose": Categorical([False]),
                },
                {
                    "C": Real(1e-5, 1e-2, prior="log-uniform"),
                    "break_ties": Categorical([True, False]),
                    "coef0": Real(1e-5, 1e-2, prior="log-uniform"),
                    "decision_function_shape": Categorical(["ovr"]),
                    "degree": Integer(2, 5),
                    "gamma": Categorical(["scale", "auto"]),
                    "kernel": Categorical(["rbf", "linear", "poly", "sigmoid"]),
                    "max_iter": Integer(100, 1000),
                    "shrinking": Categorical([True, False]),
                    "tol": Real(1e-5, 1e-2, prior="log-uniform"),
                    "verbose": Categorical([False]),
                },
            ]
        else:
            # Traditional Grid Search: define the parameter space using lists.
            # Split into two dictionaries to handle the 'ovo' and 'break_ties' constraint.
            base_params = {
                "C": self.parameter_vector_space.param_dict.get("log_small"),
                "coef0": self.parameter_vector_space.param_dict.get("log_small"),
                "degree": self.parameter_vector_space.param_dict.get("log_med"),
                "gamma": ["scale", "auto"],
                "kernel": ["rbf", "linear", "poly", "sigmoid"],
                "max_iter": self.parameter_vector_space.param_dict.get(
                    "log_large_long"
                ),
                "shrinking": self.parameter_vector_space.param_dict.get("bool_param"),
                "tol": self.parameter_vector_space.param_dict.get("log_small"),
                "verbose": [False],
            }

            # Dictionary 1: for 'ovr', break_ties can be True or False
            params_ovr = base_params.copy()
            params_ovr.update(
                {
                    "decision_function_shape": ["ovr"],
                    "break_ties": self.parameter_vector_space.param_dict.get(
                        "bool_param"
                    ),
                }
            )

            # Dictionary 2: for 'ovo', break_ties MUST be False
            params_ovo = base_params.copy()
            params_ovo.update(
                {
                    "decision_function_shape": ["ovo"],
                    "break_ties": [False],
                }
            )

            # Convert all skopt spaces to plain lists for GridSearchCV
            for p in [params_ovr, params_ovo]:
                for k, v in p.items():
                    if not isinstance(v, list):
                        p[k] = list(v)

            self.parameter_space = [params_ovr, params_ovo]
    def is_data_scaled(self) -> bool:
        """Checks if the feature matrix `X` is scaled.

        This method determines if the data appears to be scaled by checking
        whether all numeric feature values fall within the [0, 1] or [-1, 1]
        range.

        Returns:
            bool: True if data appears to be scaled, False otherwise.
        """
        if self.X is None or self.X.empty:
            return False

        # Select only numeric columns for min/max checks
        numeric_X = self.X.select_dtypes(include="number")
        if numeric_X.empty:
            return False

        # Calculate the overall range of values across all features
        min_val = numeric_X.min().min()
        max_val = numeric_X.max().max()

        # Both the [0, 1] and [-1, 1] cases reduce to this single check
        return min_val >= -1 and max_val <= 1
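
    # Note: the check above is a heuristic. Standardized (z-scored) data
    # usually contains values outside [-1, 1], so StandardScaler output may
    # still be reported as unscaled; only data bounded in [-1, 1] (for
    # example, MinMaxScaler output) reliably passes this check.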

    def scale_data(self) -> None:
        """Scales the feature matrix `X` using MinMaxScaler."""
        # Initialize MinMaxScaler with a [0, 1] feature range
        self.scaler = MinMaxScaler(feature_range=(0, 1))
        # Fit and transform the data, preserving column names
        self.X = pd.DataFrame(
            self.scaler.fit_transform(self.X), columns=self.X.columns
        )
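

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the class). Shows how the
# parameter_space built above might feed into a search. The toy data, the
# "small" parameter_space_size key, and the search settings are assumptions;
# check param_space.ParamSpace for the valid size keys in this project.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import numpy as np
    from sklearn.model_selection import GridSearchCV

    # Hypothetical toy data: any numeric DataFrame with a binary target works.
    rng = np.random.default_rng(0)
    X_demo = pd.DataFrame(rng.normal(size=(100, 4)), columns=list("abcd"))
    y_demo = pd.Series(rng.integers(0, 2, size=100))

    # __init__ detects the unscaled values and applies StandardScaler.
    model = SVCClass(X=X_demo, y=y_demo, parameter_space_size="small")

    if global_parameters.bayessearch:
        # Bayesian branch: parameter_space is a list of skopt search spaces.
        from skopt import BayesSearchCV

        search = BayesSearchCV(
            model.algorithm_implementation,
            search_spaces=model.parameter_space,
            n_iter=16,
            cv=3,
        )
    else:
        # Grid branch: parameter_space is a list of plain-list parameter grids.
        search = GridSearchCV(
            model.algorithm_implementation,
            param_grid=model.parameter_space,
            cv=3,
        )

    search.fit(model.X, model.y)
    print(search.best_params_)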