Source code for ml_grid.model_classes.svc_class

from scipy import sparse
from typing import Optional
from ml_grid.util import param_space
from ml_grid.util.global_params import global_parameters
from sklearn.svm import SVC
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from skopt.space import Real, Categorical

print("Imported SVC class")


class SVC_class:
    """SVC with support for Bayesian and traditional grid search parameter spaces."""

    def __init__(
        self,
        X: Optional[pd.DataFrame] = None,
        y: Optional[pd.Series] = None,
        parameter_space_size: Optional[str] = None,
    ):
        """Initializes the SVC_class.

        This class requires scaled data. If the input data `X` is not detected
        as scaled, it is automatically scaled using `StandardScaler`.

        Args:
            X (Optional[pd.DataFrame]): Feature matrix for training. Defaults to None.
            y (Optional[pd.Series]): Target vector for training. Defaults to None.
            parameter_space_size (Optional[str]): Size of the parameter space for
                optimization. Defaults to None.
        """
        self.X = X
        self.y = y

        # Enforce scaling for the SVM method
        if not self.is_data_scaled():
            try:
                # Data validation checks before scaling
                if self.X is None:
                    raise ValueError("Input data X is None - data not loaded properly")
                if isinstance(self.X, pd.DataFrame) and self.X.empty:
                    # raise ValueError("Input data X is an empty DataFrame")
                    print("warn: SVC data scaling, X data is empty")

                if not self.X.empty:
                    if not hasattr(self, "scaler"):
                        self.scaler = StandardScaler()

                    # Convert sparse matrices if needed
                    if sparse.issparse(self.X):
                        self.X = self.X.toarray()

                    # Ensure numeric data
                    if isinstance(self.X, pd.DataFrame):
                        non_numeric = self.X.select_dtypes(exclude=["number"]).columns
                        if len(non_numeric) > 0:
                            raise ValueError(
                                f"Non-numeric columns found: {list(non_numeric)}"
                            )

                    # Perform scaling, preserving column names and index when available
                    self.X = pd.DataFrame(
                        self.scaler.fit_transform(self.X),
                        columns=self.X.columns if hasattr(self.X, "columns") else None,
                        index=self.X.index if hasattr(self.X, "index") else None,
                    )
                    print("Data scaling completed successfully")

            except Exception as e:
                error_msg = f"Data scaling failed: {str(e)}"
                print(error_msg)
                # Additional debug info
                if hasattr(self, "X"):
                    print(f"Data type: {type(self.X)}")
                    if hasattr(self.X, "shape"):
                        print(f"Shape: {self.X.shape}")
                    if isinstance(self.X, pd.DataFrame):
                        print(f"Columns: {self.X.columns.tolist()}")
                        print(f"Data types:\n{self.X.dtypes}")
                raise RuntimeError(error_msg) from e

        self.algorithm_implementation = SVC()
        self.method_name = "SVC"

        # Define the parameter vector space
        self.parameter_vector_space = param_space.ParamSpace(parameter_space_size)

        if global_parameters.bayessearch:
            # Bayesian optimization: define the parameter space using pre-defined schemes
            self.parameter_space = [
                {
                    "C": self.parameter_vector_space.param_dict.get("log_small"),
                    "break_ties": Categorical([False]),
                    # 'cache_size': self.parameter_vector_space.param_dict.get("log_large"),  # Uncomment if needed
                    # 'class_weight': self.parameter_vector_space.param_dict.get("enum_class_weights"),  # Example for enumerating class weights
                    "coef0": self.parameter_vector_space.param_dict.get("log_small"),
                    "decision_function_shape": Categorical(["ovo"]),
                    "degree": self.parameter_vector_space.param_dict.get("log_med"),
                    "gamma": Categorical(["scale", "auto"]),
                    "kernel": Categorical(["rbf", "linear", "poly", "sigmoid"]),
                    "max_iter": self.parameter_vector_space.param_dict.get("log_large_long"),
                    # 'probability': Categorical([True, False]),  # Uncomment if needed
                    # 'random_state': Categorical([None]),  # Example for random state
                    "shrinking": self.parameter_vector_space.param_dict.get("bool_param"),
                    "tol": self.parameter_vector_space.param_dict.get("log_small"),
                    "verbose": Categorical([True, False]),
                },
                {
                    "C": self.parameter_vector_space.param_dict.get("log_small"),
                    "break_ties": Categorical([True, False]),
                    # 'cache_size': self.parameter_vector_space.param_dict.get("log_large"),  # Uncomment if needed
                    # 'class_weight': self.parameter_vector_space.param_dict.get("enum_class_weights"),  # Example for enumerating class weights
                    "coef0": self.parameter_vector_space.param_dict.get("log_small"),
                    "decision_function_shape": Categorical(["ovr"]),
                    "degree": self.parameter_vector_space.param_dict.get("log_med"),
                    "gamma": Categorical(["scale", "auto"]),
                    "kernel": Categorical(["rbf", "linear", "poly", "sigmoid"]),
                    "max_iter": self.parameter_vector_space.param_dict.get("log_large_long"),
                    # 'probability': Categorical([True, False]),  # Uncomment if needed
                    # 'random_state': Categorical([None]),  # Example for random state
                    "shrinking": self.parameter_vector_space.param_dict.get("bool_param"),
                    "tol": self.parameter_vector_space.param_dict.get("log_small"),
                    "verbose": Categorical([True, False]),
                },
            ]
        else:
            # Traditional grid search: define the parameter space using lists
            self.parameter_space = {
                "C": list(self.parameter_vector_space.param_dict.get("log_small")),
                "break_ties": list(
                    self.parameter_vector_space.param_dict.get("bool_param")
                ),
                # 'cache_size': [200],  # Uncomment if needed
                # 'class_weight': [None, "balanced"]
                # + [{0: w} for w in [1, 2, 4, 6, 10]],  # Enumerate class weights
                "coef0": list(self.parameter_vector_space.param_dict.get("log_small")),
                "decision_function_shape": ["ovr", "ovo"],
                "degree": list(self.parameter_vector_space.param_dict.get("log_med")),
                "gamma": ["scale", "auto"],
                "kernel": ["rbf", "linear", "poly", "sigmoid"],
                "max_iter": list(
                    self.parameter_vector_space.param_dict.get("log_large_long")
                ),
                # 'probability': [False],  # Uncomment if needed
                # 'random_state': [None],  # Example for random state
                "shrinking": list(
                    self.parameter_vector_space.param_dict.get("bool_param")
                ),
                "tol": list(self.parameter_vector_space.param_dict.get("log_small")),
                "verbose": [False],
            }

        return None
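
    # Explanatory note (added, not in the original source): the Bayesian space is
    # split into two sub-dicts because scikit-learn's SVC does not allow
    # break_ties=True together with decision_function_shape="ovo". The first
    # sub-space therefore pins break_ties to False for "ovo", while the second
    # explores both break_ties values for "ovr". skopt's BayesSearchCV accepts
    # such a list of dicts and samples each sub-space separately, whereas the
    # grid-search branch passes a single flat dict of lists to GridSearchCV.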

    def is_data_scaled(self) -> bool:
        """Checks if the feature matrix `X` is scaled.

        This method determines if the data appears to be scaled by checking
        whether all feature values fall within the [0, 1] or [-1, 1] range.

        Returns:
            bool: True if the data appears to be scaled, False otherwise.
        """
        if self.X is None or self.X.empty:
            return False

        # Select only numeric columns for the min/max checks
        numeric_X = self.X.select_dtypes(include="number")
        if numeric_X.empty:
            return False

        # Calculate the overall range of values across all features
        min_val = numeric_X.min().min()
        max_val = numeric_X.max().max()

        # Check if the data is scaled to the [0, 1] or [-1, 1] range
        if (min_val >= 0 and max_val <= 1) or (min_val >= -1 and max_val <= 1):
            return True

        return False
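
    # Illustrative note (added, not in the original source): the heuristic above
    # only inspects the global min/max, so a frame whose values already lie in
    # [-1, 1] (e.g. MinMaxScaler output) passes the check, while raw features
    # such as ages in [18, 90] fail it and trigger the StandardScaler step
    # in __init__.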

    def scale_data(self) -> None:
        """Scales the feature matrix `X` to [0, 1] using MinMaxScaler."""
        # Initialize MinMaxScaler
        self.scaler = MinMaxScaler(feature_range=(0, 1))

        # Fit and transform the data, preserving column names
        self.X = pd.DataFrame(
            self.scaler.fit_transform(self.X), columns=self.X.columns
        )
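

# ---------------------------------------------------------------------------
# Illustrative usage sketch (added, not part of the original module). It
# assumes the parameter_space built in __init__ is consumed by skopt's
# BayesSearchCV when global_parameters.bayessearch is True, and by sklearn's
# GridSearchCV otherwise; the real search driver lives elsewhere in ml_grid.
# The "medium" size label and the _example_search helper are hypothetical.
# ---------------------------------------------------------------------------
def _example_search(X: pd.DataFrame, y: pd.Series):
    """Hypothetical helper showing how SVC_class output could be searched."""
    from sklearn.model_selection import GridSearchCV
    from skopt import BayesSearchCV

    model_class = SVC_class(X=X, y=y, parameter_space_size="medium")

    if global_parameters.bayessearch:
        # A list of dicts defines disjoint sub-spaces, sampled by BayesSearchCV.
        search = BayesSearchCV(
            model_class.algorithm_implementation,
            model_class.parameter_space,
            n_iter=16,
            cv=3,
        )
    else:
        search = GridSearchCV(
            model_class.algorithm_implementation,
            model_class.parameter_space,
            cv=3,
        )

    # X may have been re-scaled in __init__, so fit on model_class.X.
    search.fit(model_class.X, y)
    return search.best_params_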