# Source code for ml_grid.util.grid_param_space

"""Defines the Grid class for creating a hyperparameter search space."""

import itertools as it
import logging
import random
from typing import Dict, Generator, List, Optional, Union

from ml_grid.util.global_params import global_parameters



[docs]
class Grid:
    """Generates and manages a grid of hyperparameter settings for experiments."""

    def __init__(self, sample_n: Optional[int] = 1000):
        """Initializes the Grid object.

        The search space is the Cartesian product of every option in
        ``self.grid`` (including the nested ``"data"`` dictionary). Instead of
        materialising that product -- which runs to tens of millions of
        combinations for the default grid -- its size is computed
        arithmetically, ``sample_n`` distinct positions are drawn uniformly at
        random, and only those combinations are built.

        Args:
            sample_n (Optional[int], optional): The number of random settings to
                sample from the full grid. ``None`` is treated as 1000. If it
                exceeds the number of available settings, every setting is
                used (in random order) and a warning is logged. Defaults to
                1000.

        Raises:
            ValueError: If ``sample_n`` is negative (raised by
                ``random.sample``).
        """
        # Shared project configuration object; only ``verbose`` is read here.
        self.global_params = global_parameters
        self.verbose = self.global_params.verbose
        self.logger = logging.getLogger("ml_grid")

        self.sample_n = 1000 if sample_n is None else sample_n

        self.logger.info(f"Feature space slice sample_n: {self.sample_n}")

        # Default grid.
        # NOTE: settings are sampled from this dictionary during construction,
        # so changing ``self.grid`` afterwards does not alter ``settings_list``.
        self.grid = {
            "resample": ["undersample", "oversample", None],
            "scale": [True, False],
            "feature_n": [100, 95, 75, 50, 25, 5],
            "param_space_size": ["medium", "xsmall"],
            "n_unique_out": [10],
            "outcome_var_n": ["1"],
            "percent_missing": [99, 95, 80],  # n/100, e.g. 95 means 95%
            "corr": [0.98, 0.85, 0.5, 0.25],
            "feature_selection_method": ["anova", "markov_blanket"],
            "use_embedding": [True, False],
            "embedding_method": ["pca", "svd"],
            "embedding_dim": [32, 64, 128],
            "scale_features_before_embedding": [True],
            "cache_embeddings": [False],
            "data": [
                {
                    "age": [True, False],
                    "sex": [True, False],
                    "bmi": [True],
                    "ethnicity": [True, False],
                    "bloods": [True, False],
                    "diagnostic_order": [True, False],
                    "drug_order": [True, False],
                    "annotation_n": [True, False],
                    "meta_sp_annotation_n": [True, False],
                    "annotation_mrc_n": [True, False],
                    "meta_sp_annotation_mrc_n": [True, False],
                    "core_02": [False],
                    "bed": [False],
                    "vte_status": [True],
                    "hosp_site": [True],
                    "core_resus": [False],
                    "news": [False],
                    "date_time_stamp": [False],
                }
            ],
        }

        def option_count(node: Union[Dict, List]) -> int:
            """Returns how many combinations ``node`` expands to.

            A list contributes the sum of its alternatives (a scalar counts as
            one, a nested dict/list counts as its own expansion); a dict
            contributes the product of the counts of its values.

            Args:
                node (Union[Dict, List]): The dictionary or list of settings.

            Returns:
                int: The number of distinct combinations.
            """
            if isinstance(node, list):
                return sum(
                    option_count(item) if isinstance(item, (dict, list)) else 1
                    for item in node
                )
            total = 1
            for value in node.values():
                total *= option_count(value)
            return total

        def option_at(node: Union[Dict, List], index: int) -> object:
            """Builds the ``index``-th combination of ``node``.

            The ordering matches a depth-first Cartesian product in which the
            last dictionary key varies fastest and list alternatives are
            visited in list order.

            Args:
                node (Union[Dict, List]): The dictionary or list of settings.
                index (int): Position in ``range(option_count(node))``.

            Returns:
                object: A scalar option for a list of scalars, or a newly built
                dictionary for a dict node.

            Raises:
                IndexError: If ``index`` is beyond the last combination.
            """
            if isinstance(node, list):
                for item in node:
                    width = (
                        option_count(item) if isinstance(item, (dict, list)) else 1
                    )
                    if index < width:
                        if isinstance(item, (dict, list)):
                            return option_at(item, index)
                        return item
                    index -= width
                raise IndexError("setting index out of range")

            # Mixed-radix decode: peel digits off starting with the last key,
            # because that is the fastest-varying position of the product.
            keys = list(node)
            picks = []
            for key in reversed(keys):
                width = option_count(node[key])
                index, offset = divmod(index, width)
                picks.append(option_at(node[key], offset))
            if index:
                raise IndexError("setting index out of range")
            picks.reverse()
            return dict(zip(keys, picks))

        full_settings_size = option_count(self.grid)
        self.logger.info(f"Full settings_list size: {full_settings_size}")

        # Ensure sample_n is not greater than the number of available settings
        sample_size = min(self.sample_n, full_settings_size)
        if self.sample_n > full_settings_size:
            self.logger.warning(
                f"sample_n ({self.sample_n}) is larger than the number of settings ({full_settings_size}). Using all settings."
            )

        # Sampling positions from a range is uniform and without replacement,
        # and never requires holding the whole grid in memory.
        chosen_positions = random.sample(range(full_settings_size), sample_size)
        self.settings_list = [
            option_at(self.grid, position) for position in chosen_positions
        ]

        # Single-pass iterator over the sampled settings.
        self.settings_list_iterator = iter(self.settings_list)



        # This is likely not properly functioning. Does not return iteration, instead reinitiates.
        # Don't need to subsample, can just generate n number of random choices from grid space.
        # function can just return random choice from grid space, terminate at the other end once limit reached.