import logging
from typing import Any, Dict, List, Optional, Union
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from skopt.space import Categorical, Integer, Real
from ml_grid.util import param_space
from ml_grid.util.global_params import global_parameters
logging.getLogger("ml_grid").debug("Imported SVC class")
class SVCClass:
"""SVC with support for Bayesian and traditional grid search parameter spaces."""
def __init__(
self,
X: Optional[pd.DataFrame] = None,
y: Optional[pd.Series] = None,
parameter_space_size: Optional[str] = None,
):
"""Initializes the SVCClass.
This class requires scaled data. If the input data `X` is not detected
as scaled, it will be automatically scaled using `StandardScaler`.
Args:
X (Optional[pd.DataFrame]): Feature matrix for training. Defaults to None.
y (Optional[pd.Series]): Target vector for training. Defaults to None.
parameter_space_size (Optional[str]): Size of the parameter space for
optimization. Defaults to None.
Raises:
ValueError: If `parameter_space_size` is not a valid key.
RuntimeError: If data scaling fails.
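
        Example:
            A minimal usage sketch (illustrative only), assuming a small
            synthetic dataset and that ``"medium"`` is a valid ``ParamSpace``
            size key::

                import numpy as np
                import pandas as pd

                X = pd.DataFrame(np.random.randn(50, 4) * 10.0)
                y = pd.Series(np.random.randint(0, 2, size=50))
                model = SVCClass(X=X, y=y, parameter_space_size="medium")
                # model.X has been standardized automatically and
                # model.parameter_space is ready for the search backend.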
"""
        self.X: Optional[pd.DataFrame] = X
        self.y: Optional[pd.Series] = y
        self.scaler: Optional[Union[StandardScaler, MinMaxScaler]] = None
        # SVC is sensitive to feature scale, so enforce scaling here.
        if not self.is_data_scaled():
            try:
                # Validate the data before attempting to scale it.
                if self.X is None:
                    raise ValueError("Input data X is None - data not loaded properly")
                if isinstance(self.X, pd.DataFrame) and self.X.empty:
                    raise ValueError(
                        "SVC_class received an empty DataFrame. Halting execution."
                    )
                if self.scaler is None:
                    self.scaler = StandardScaler()
                # Densify sparse input; StandardScaler cannot centre a sparse matrix.
                if sparse.issparse(self.X):
                    self.X = self.X.toarray()
                # Scaling requires purely numeric columns.
                if isinstance(self.X, pd.DataFrame):
                    non_numeric = self.X.select_dtypes(exclude=["number"]).columns
                    if len(non_numeric) > 0:
                        raise ValueError(
                            f"Non-numeric columns found: {list(non_numeric)}"
                        )
                # Fit and apply the scaler, preserving column names and index
                # when the input is a DataFrame.
                self.X = pd.DataFrame(
                    self.scaler.fit_transform(self.X),
                    columns=self.X.columns if hasattr(self.X, "columns") else None,
                    index=self.X.index if hasattr(self.X, "index") else None,
                )
                logging.getLogger("ml_grid").info(
                    "Data scaling completed successfully for SVC"
                )
            except Exception as e:
                error_msg = f"Data scaling failed: {e}"
                logger = logging.getLogger("ml_grid")
                logger.error(error_msg)
                # Log additional context to aid debugging.
                if self.X is not None:
                    logger.debug(f"Data type: {type(self.X)}")
                    if hasattr(self.X, "shape"):
                        logger.debug(f"Shape: {self.X.shape}")
                    if isinstance(self.X, pd.DataFrame):
                        logger.debug(f"Columns: {self.X.columns.tolist()}")
                        logger.debug(f"Data types:\n{self.X.dtypes}")
                raise RuntimeError(error_msg) from e
        self.algorithm_implementation: SVC = SVC()
        self.method_name: str = "SVC"
        # Define the parameter vector space.
        self.parameter_vector_space: param_space.ParamSpace = param_space.ParamSpace(
            parameter_space_size
        )
        self.parameter_space: List[Dict[str, Any]]
        if global_parameters.bayessearch:
            # Bayesian optimization: the space is split into two sub-spaces,
            # because scikit-learn requires break_ties=False whenever
            # decision_function_shape='ovo'.
            self.parameter_space = [
                {
                    # Sub-space 1: 'ovo', so break_ties is fixed to False.
                    "C": Real(1e-5, 1e-2, prior="log-uniform"),
                    "break_ties": Categorical([False]),
                    # 'cache_size': self.parameter_vector_space.param_dict.get("log_large"),  # Uncomment if needed
                    # 'class_weight': self.parameter_vector_space.param_dict.get("enum_class_weights"),  # Example for enumerating class weights
                    "coef0": Real(1e-5, 1e-2, prior="log-uniform"),
                    "decision_function_shape": Categorical(["ovo"]),
                    "degree": Integer(2, 5),
                    "gamma": Categorical(["scale", "auto"]),
                    "kernel": Categorical(["rbf", "linear", "poly", "sigmoid"]),
                    "max_iter": Integer(100, 1000),
                    # 'probability': Categorical([True, False]),  # Uncomment if needed
                    # 'random_state': Categorical([None]),  # Example for random state
                    "shrinking": Categorical([True, False]),
                    "tol": Real(1e-5, 1e-2, prior="log-uniform"),
                    "verbose": Categorical([False]),
                },
                {
                    # Sub-space 2: 'ovr', where break_ties may be True or False.
                    "C": Real(1e-5, 1e-2, prior="log-uniform"),
                    "break_ties": Categorical([True, False]),
                    # 'cache_size': self.parameter_vector_space.param_dict.get("log_large"),  # Uncomment if needed
                    # 'class_weight': self.parameter_vector_space.param_dict.get("enum_class_weights"),  # Example for enumerating class weights
                    "coef0": Real(1e-5, 1e-2, prior="log-uniform"),
                    "decision_function_shape": Categorical(["ovr"]),
                    "degree": Integer(2, 5),
                    "gamma": Categorical(["scale", "auto"]),
                    "kernel": Categorical(["rbf", "linear", "poly", "sigmoid"]),
                    "max_iter": Integer(100, 1000),
                    # 'probability': Categorical([True, False]),  # Uncomment if needed
                    # 'random_state': Categorical([None]),  # Example for random state
                    "shrinking": Categorical([True, False]),
                    "tol": Real(1e-5, 1e-2, prior="log-uniform"),
                    "verbose": Categorical([False]),
                },
            ]
else:
# Traditional Grid Search: Define parameter space using lists
# Split into two dictionaries to handle the 'ovo' and 'break_ties' constraint.
base_params = {
"C": self.parameter_vector_space.param_dict.get("log_small"),
"coef0": self.parameter_vector_space.param_dict.get("log_small"),
"degree": self.parameter_vector_space.param_dict.get("log_med"),
"gamma": ["scale", "auto"],
"kernel": ["rbf", "linear", "poly", "sigmoid"],
"max_iter": self.parameter_vector_space.param_dict.get(
"log_large_long"
),
"shrinking": self.parameter_vector_space.param_dict.get("bool_param"),
"tol": self.parameter_vector_space.param_dict.get("log_small"),
"verbose": [False],
}
# Dictionary 1: For 'ovr', break_ties can be True or False
params_ovr = base_params.copy()
params_ovr.update(
{
"decision_function_shape": ["ovr"],
"break_ties": self.parameter_vector_space.param_dict.get(
"bool_param"
),
}
)
# Dictionary 2: For 'ovo', break_ties MUST be False
params_ovo = base_params.copy()
params_ovo.update(
{
"decision_function_shape": ["ovo"],
"break_ties": [False],
}
)
            # Ensure every parameter's values are a plain list, as GridSearchCV expects.
for p in [params_ovr, params_ovo]:
for k, v in p.items():
if not isinstance(v, list):
p[k] = list(v)
self.parameter_space = [params_ovr, params_ovo]
    def is_data_scaled(self) -> bool:
        """Checks whether the feature matrix `X` appears to be scaled.

        The heuristic treats the data as scaled if every numeric feature value
        lies within [-1, 1], which covers both min-max scaling to [0, 1] and
        scaling to [-1, 1].

        Returns:
            bool: True if the data appears to be scaled, False otherwise.
        """
        # Non-DataFrame inputs (e.g. sparse matrices) are handled by the
        # scaling path in __init__, so treat them as unscaled here.
        if self.X is None or not isinstance(self.X, pd.DataFrame) or self.X.empty:
            return False
        # Select only numeric columns for the min/max check.
        numeric_X = self.X.select_dtypes(include="number")
        if numeric_X.empty:
            return False
        # Compute the overall value range across all numeric features.
        min_val = numeric_X.min().min()
        max_val = numeric_X.max().max()
        return bool(min_val >= -1 and max_val <= 1)
    def scale_data(self) -> None:
        """Scales the feature matrix `X` to the [0, 1] range using MinMaxScaler."""
        if self.X is None:
            raise ValueError("Input data X is None - cannot scale")
        self.scaler = MinMaxScaler(feature_range=(0, 1))
        # Fit and transform, preserving column names and row index.
        self.X = pd.DataFrame(
            self.scaler.fit_transform(self.X),
            columns=self.X.columns,
            index=self.X.index,
        )
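

# A minimal smoke test (illustrative sketch, not part of the library API).
# It assumes "medium" is a valid ParamSpace size key; substitute a real key
# if your installation uses different size names.
if __name__ == "__main__":
    import numpy as np

    X_demo = pd.DataFrame(
        np.random.randn(30, 3) * 5.0, columns=["f0", "f1", "f2"]
    )
    y_demo = pd.Series(np.random.randint(0, 2, size=30))
    svc = SVCClass(X=X_demo, y=y_demo, parameter_space_size="medium")
    print(f"{svc.method_name}: {len(svc.parameter_space)} parameter sub-spaces")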