# Source code for ml_grid.util.synthetic_data_generator

"""
Module for generating synthetic datasets that mimic the structure of real-world
data used in the ml-grid pipeline.
"""

import logging
import random
from typing import List

import numpy as np
import pandas as pd
from tqdm import tqdm


class SyntheticDataGenerator:
    """
    Generates a synthetic DataFrame for testing the ml-grid pipeline.

    This class creates a dataset with realistic column names and structures,
    including various feature types (blood tests, annotations, etc.),
    multiple configurable outcome variables, and a controllable
    signal-to-noise ratio.

    Attributes:
        n_rows (int): The number of rows (samples) in the dataset.
        n_features (int): The total number of feature columns to generate.
        n_outcome_vars (int): The number of outcome variable columns to create.
        feature_strength (float): A factor to control the influence of
            "important" features on the outcome variables. Higher values
            create a stronger signal.
        percent_important_features (float): The fraction of total features
            that will be correlated with each outcome variable.
        percent_binary_features (float): Requested fraction of binary
            features. Validated and stored, but column types are currently
            derived from the column names only.
        percent_int_features (float): Requested fraction of integer features.
            Validated and stored, but column types are currently derived
            from the column names only.
        logger (logging.Logger): The module-level logger. Its level is set
            from ``verbose``, so the setting is shared by every instance
            created from this module.
    """

    def __init__(
        self,
        n_rows: int = 1000,
        n_features: int = 150,
        n_outcome_vars: int = 3,
        feature_strength: float = 0.8,
        percent_important_features: float = 0.1,
        percent_binary_features: float = 0.15,
        percent_int_features: float = 0.2,
        verbose: bool = True,
    ):
        """
        Initializes the SyntheticDataGenerator with specified parameters.

        Args:
            n_rows (int): Number of rows for the synthetic dataset.
            n_features (int): Number of feature columns to generate.
            n_outcome_vars (int): Number of outcome variables to generate.
            feature_strength (float): Strength of the signal from important
                features. Must be between 0 and 1.
            percent_important_features (float): Fraction of features that
                should be predictive of the outcome.
            percent_binary_features (float): Fraction of features to be binary.
            percent_int_features (float): Fraction of features to be
                integer-based.
            verbose (bool): If True, logs generation status messages.

        Raises:
            ValueError: If ``feature_strength`` is outside [0, 1] or the sum
                of the binary and int fractions is outside [0, 1].
        """
        if not 0 <= feature_strength <= 1:
            raise ValueError("feature_strength must be between 0 and 1.")
        if not 0 <= percent_binary_features + percent_int_features <= 1:
            raise ValueError(
                "The sum of binary and int feature percentages must be <= 1."
            )

        self.n_rows = n_rows
        self.n_features = n_features
        self.n_outcome_vars = n_outcome_vars
        self.feature_strength = feature_strength
        self.percent_important_features = percent_important_features
        self.percent_binary_features = percent_binary_features
        self.percent_int_features = percent_int_features

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO if verbose else logging.WARNING)

        # Based on ml_grid/pipeline/column_names.py and sample data
        self._feature_prefixes = [
            "Alkaline Phosphatase",
            "RBC",
            "PLT",
            "Sodium",
            "Potassium",
            "C-reactive Protein",
            "Glycated Hb",
            "White Cell Count",
            "Monocytes",
            "MCHC.",
            "Calcium measurement",
            "Neutrophils",
            "Insertion - action (qualifier value)",
            "Routine (qualifier value)",
            "General treatment (procedure)",
            "Research fellow (occupation)",
            "Phlebotomy (procedure)",
            "Date of birth (observable entity)",
            "Antibiotic (product)",
            "Hypercholesterolemia (disorder)",
            "Sinus rhythm (finding)",
            "Capsule (basic dose form)",
            "Transplantation of liver (procedure)",
            "History of clinical finding",
        ]
        self._feature_suffixes = [
            "_mean",
            "_median",
            "_mode",
            "_std",
            "_num-tests",
            "_days-since-last-test",
            "_max",
            "_min",
            "_most-recent",
            "_earliest-test",
            "_days-between-first-last",
            "_contains-extreme-low",
            "_contains-extreme-high",
            "_num-diagnostic-order",
            "_count",
            "_count_subject_present",
            "_count_subject_not_present",
            "_count_relative_present",
            "_count_relative_not_present",
            "_count_subject_present_mrc_cs",
        ]
        # Suffixes that imply integer or binary types
        self._int_suffixes = ["_count", "_num-tests", "_num-diagnostic-order"]
        self._binary_suffixes = ["_contains-extreme-low", "_contains-extreme-high"]
        # client_idcode is deliberately NOT listed here: it is an identifier
        # that generate() adds as a structural column. Sampling it as a
        # feature would fill it with Gaussian noise and let it be picked as
        # an "important" feature.
        self._special_features = [
            "age",
            "male",
            "bmi_value",
            "census_ethnicity_white",
            "core_02_val",
            "bed_type_A",
            "vte_status_1",
            "hosp_site_X",
            "core_resus_status",
            "news_score",
        ]

    def _generate_column_names(self) -> List[str]:
        """
        Generates a list of realistic, structured feature names.

        Up to five special features are sampled first, then unique
        ``prefix + suffix`` combinations, and finally ``prefix_r<k>suffix``
        rounds for as long as more names are required.

        Returns:
            List[str]: ``n_features`` unique column names in random order.
        """
        self.logger.info(f"Generating {self.n_features} column names...")
        generated_names = []

        # Add some special features first
        num_special = min(len(self._special_features), 5)
        generated_names.extend(random.sample(self._special_features, num_special))

        # Calculate how many more names we need
        remaining = self.n_features - len(generated_names)
        if remaining <= 0:
            return generated_names[: self.n_features]

        # 1. Clean combinations (prefix + suffix); this avoids _r<k> names
        #    unless we run out of unique clean names.
        clean_combinations = [
            f"{prefix}{suffix}"
            for prefix in self._feature_prefixes
            for suffix in self._feature_suffixes
        ]
        random.shuffle(clean_combinations)
        taken = clean_combinations[:remaining]
        generated_names.extend(taken)
        remaining -= len(taken)

        # 2. If we still need more, use rounds (_r1, _r2, ...). Every round
        #    yields a fresh set of unique names, so loop until the request
        #    is satisfied rather than silently returning too few names.
        round_num = 1
        while remaining > 0:
            round_candidates = [
                f"{prefix}_r{round_num}{suffix}"
                for prefix in self._feature_prefixes
                for suffix in self._feature_suffixes
            ]
            if not round_candidates:
                # No prefixes/suffixes configured: nothing more to generate.
                break
            random.shuffle(round_candidates)
            taken = round_candidates[:remaining]
            generated_names.extend(taken)
            remaining -= len(taken)
            round_num += 1

        # Final shuffle so special features are not clustered together
        random.shuffle(generated_names)
        return generated_names[: self.n_features]

    def _generate_typed_data(self, feature_names: List[str]) -> np.ndarray:
        """
        Generates data with appropriate types based on column names.

        This is significantly faster than modifying DataFrame columns after
        creation.

        Args:
            feature_names (List[str]): Column names; the position of each
                name is the column index in the returned array.

        Returns:
            np.ndarray: A float32 array of shape
            ``(n_rows, len(feature_names))``.
        """
        # Pre-categorize columns to avoid repeated string matching
        age_cols = []
        binary_cols = []
        bmi_cols = []
        int_cols = []
        binary_suffix_cols = []
        normal_cols = []

        self.logger.info("Categorizing columns...")
        for idx, col in enumerate(feature_names):
            if col == "age":
                age_cols.append(idx)
            elif col == "male" or "vte_status" in col or "bed_type" in col:
                binary_cols.append(idx)
            elif col == "bmi_value":
                bmi_cols.append(idx)
            elif any(s in col for s in self._int_suffixes):
                int_cols.append(idx)
            elif any(s in col for s in self._binary_suffixes):
                binary_suffix_cols.append(idx)
            else:
                normal_cols.append(idx)

        # Pre-allocate array; every column is filled below
        data = np.empty((self.n_rows, len(feature_names)), dtype=np.float32)

        self.logger.info("Generating typed data...")
        for idx in age_cols:
            data[:, idx] = np.random.randint(20, 90, size=self.n_rows)
        for idx in binary_cols:
            data[:, idx] = np.random.randint(0, 2, size=self.n_rows)
        for idx in bmi_cols:
            data[:, idx] = np.random.uniform(18, 45, size=self.n_rows)
        for idx in int_cols:
            # Poisson counts scaled by a per-column integer factor
            data[:, idx] = np.random.poisson(5, size=self.n_rows) * random.randint(
                1, 5
            )
        for idx in binary_suffix_cols:
            data[:, idx] = np.random.randint(0, 2, size=self.n_rows)

        # Generate all normal columns at once
        if normal_cols:
            self.logger.info(
                f"Generating {len(normal_cols)} normal distribution columns..."
            )
            normal_data = np.random.randn(self.n_rows, len(normal_cols)).astype(
                np.float32
            )
            data[:, normal_cols] = normal_data

        return data

    def generate(self) -> tuple[pd.DataFrame, dict[str, list[str]]]:
        """
        Generates and returns the synthetic DataFrame and a map of important
        features.

        Column order is ``Unnamed: 0 | <features> | <outcome_vars> |
        client_idcode``. Outcomes are computed before missing values are
        introduced into the feature columns.

        Returns:
            tuple[pd.DataFrame, dict[str, list[str]]]:
                - The fully generated synthetic dataset.
                - A dictionary mapping each outcome variable to its list of
                  important features.
        """
        self.logger.info(
            f"Starting generation: {self.n_rows} rows × {self.n_features} features"
        )

        # 1. Generate feature names
        self.logger.info("Generating column names...")
        feature_names = self._generate_column_names()

        # 2. Generate typed data directly (much faster than modifying after)
        data = self._generate_typed_data(feature_names)

        self.logger.info("Creating DataFrame...")
        df = pd.DataFrame(data, columns=feature_names)

        # Dictionary to hold outcome variables and metadata
        new_cols_dict = {}
        outcome_to_features_map = {}

        # 3. Determine number of important features; never ask for more
        #    columns than exist, which would make DataFrame.sample raise.
        n_important = min(
            len(feature_names),
            max(1, int(self.n_features * self.percent_important_features)),
        )
        self.logger.info(f"Generating {self.n_outcome_vars} outcome variables.")

        # 4. Generate outcome variables with progress bar
        for i in tqdm(
            range(1, self.n_outcome_vars + 1),
            desc="Creating outcomes",
            disable=not self.logger.isEnabledFor(logging.INFO),
        ):
            outcome_col_name = f"outcome_var_{i}"

            # Select a set of important features for *this* outcome
            important_features = (
                df.columns.to_series()
                .sample(
                    n=n_important, random_state=42 + i  # Use `i` to vary the seed
                )
                .tolist()
            )
            outcome_to_features_map[outcome_col_name] = important_features

            if i <= 3 or i == self.n_outcome_vars:  # Only log first 3 and last
                self.logger.info(
                    f"  For '{outcome_col_name}', selected {len(important_features)} important features"
                )

            # Create signal from important features (vectorized)
            signal = df[important_features].values.sum(axis=1) * self.feature_strength

            # Create noise, scaled relative to the spread of the signal
            noise_strength = 1 - self.feature_strength
            noise = (
                np.random.randn(self.n_rows).astype(np.float32)
                * noise_strength
                * signal.std()
            )

            # Combine signal and noise, then create binary outcome
            combined_signal = signal + noise
            threshold = np.median(combined_signal)
            outcome = (combined_signal > threshold).astype(np.int8)

            # Randomly flip 10% of outcomes
            flip_mask = np.random.rand(self.n_rows) < 0.1
            outcome[flip_mask] = 1 - outcome[flip_mask]

            new_cols_dict[outcome_col_name] = outcome

        # 5. Add metadata columns
        self.logger.info("Adding metadata columns...")
        if "client_idcode" not in df.columns:
            new_cols_dict["client_idcode"] = [f"id_{j}" for j in range(self.n_rows)]
        new_cols_dict["Unnamed: 0"] = np.arange(self.n_rows, dtype=np.int32)

        # 6. Concatenate all at once
        self.logger.info("Concatenating final DataFrame...")
        new_cols_df = pd.DataFrame(new_cols_dict, index=df.index)
        df = pd.concat(
            [
                new_cols_df[["Unnamed: 0"]],
                df,
                new_cols_df.drop(columns=["Unnamed: 0"]),
            ],
            axis=1,
        )

        # 7. Introduce missing values (vectorized per column)
        self.logger.info("Introducing missing values...")
        feature_cols = [
            col
            for col in df.columns
            if not (
                col.startswith("outcome_var")
                or col == "client_idcode"
                or col == "Unnamed: 0"
            )
        ]
        cols_with_nans = random.sample(feature_cols, int(len(feature_cols) * 0.15))

        for col in tqdm(
            cols_with_nans,
            desc="Adding NaNs",
            disable=not self.logger.isEnabledFor(logging.INFO),
        ):
            frac = random.uniform(0.01, 0.2)
            n_nans = int(self.n_rows * frac)
            nan_indices = np.random.choice(self.n_rows, size=n_nans, replace=False)
            df.loc[nan_indices, col] = np.nan

        self.logger.info(f"Generation complete! Shape: {df.shape}")
        return df, outcome_to_features_map
class SyntheticTSDataGenerator:
    """
    Generates a synthetic longitudinal (time-series) dataset for testing the
    ml-grid pipeline.

    The output is a long-format 2D DataFrame where each row represents one
    observation for one patient at one timestamp. Multiple rows share the
    same ``client_idcode``, forming that patient's time series. Feature
    columns and outcome generation follow the same signal/noise approach as
    ``SyntheticDataGenerator``.

    Attributes:
        n_instances (int): Number of unique patients (client_idcodes).
        n_timepoints (int): Number of timestamped rows per patient.
        n_features (int): Number of clinical feature columns to generate.
        n_outcome_vars (int): Number of binary outcome columns to generate.
        feature_strength (float): Controls signal strength
            (0 = pure noise, 1 = pure signal).
        percent_important_features (float): Fraction of features that
            influence each outcome.
        percent_missing (float): Approximate fraction of feature values to
            set to NaN.
        start_date (str): ISO date string for the first timestamp.
        logger (logging.Logger): The module-level logger. Its level is set
            from ``verbose``, so the setting is shared by every instance
            created from this module.
    """

    def __init__(
        self,
        n_instances: int = 200,
        n_timepoints: int = 50,
        n_features: int = 100,
        n_outcome_vars: int = 1,
        feature_strength: float = 0.8,
        percent_important_features: float = 0.1,
        percent_missing: float = 0.1,
        start_date: str = "2022-01-01",
        verbose: bool = True,
    ):
        """
        Initializes the SyntheticTSDataGenerator.

        Args:
            n_instances (int): Number of unique patients.
            n_timepoints (int): Number of daily timestamped rows per patient.
            n_features (int): Number of feature columns to generate.
            n_outcome_vars (int): Number of binary outcome columns to generate.
            feature_strength (float): Strength of the signal from important
                features. Must be between 0 and 1.
            percent_important_features (float): Fraction of features that
                should be predictive of each outcome.
            percent_missing (float): Approximate fraction of feature values
                to set to NaN. Must be between 0 and 1.
            start_date (str): ISO date string for the first timestamp
                (e.g. ``"2022-01-01"``).
            verbose (bool): If True, enables logging of generation status.

        Raises:
            ValueError: If ``feature_strength`` or ``percent_missing`` is
                outside [0, 1].
        """
        if not 0 <= feature_strength <= 1:
            raise ValueError("feature_strength must be between 0 and 1.")
        if not 0 <= percent_missing <= 1:
            raise ValueError("percent_missing must be between 0 and 1.")

        self.n_instances = n_instances
        self.n_timepoints = n_timepoints
        self.n_features = n_features
        self.n_outcome_vars = n_outcome_vars
        self.feature_strength = feature_strength
        self.percent_important_features = percent_important_features
        self.percent_missing = percent_missing
        self.start_date = start_date

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO if verbose else logging.WARNING)

        # Reuse the same prefixes/suffixes as SyntheticDataGenerator for
        # consistent, realistic column names across both generators.
        self._feature_prefixes = [
            "Alkaline Phosphatase",
            "RBC",
            "PLT",
            "Sodium",
            "Potassium",
            "C-reactive Protein",
            "Glycated Hb",
            "White Cell Count",
            "Monocytes",
            "MCHC.",
            "Calcium measurement",
            "Neutrophils",
            "Insertion - action (qualifier value)",
            "Routine (qualifier value)",
            "General treatment (procedure)",
            "Research fellow (occupation)",
            "Phlebotomy (procedure)",
            "Date of birth (observable entity)",
            "Antibiotic (product)",
            "Hypercholesterolemia (disorder)",
            "Sinus rhythm (finding)",
            "Capsule (basic dose form)",
            "Transplantation of liver (procedure)",
            "History of clinical finding",
        ]
        self._feature_suffixes = [
            "_mean",
            "_median",
            "_mode",
            "_std",
            "_num-tests",
            "_days-since-last-test",
            "_max",
            "_min",
            "_most-recent",
            "_earliest-test",
            "_days-between-first-last",
            "_contains-extreme-low",
            "_contains-extreme-high",
            "_num-diagnostic-order",
            "_count",
            "_count_subject_present",
            "_count_subject_not_present",
            "_count_relative_present",
            "_count_relative_not_present",
            "_count_subject_present_mrc_cs",
        ]
        self._int_suffixes = ["_count", "_num-tests", "_num-diagnostic-order"]
        self._binary_suffixes = ["_contains-extreme-low", "_contains-extreme-high"]
        # Note: client_idcode is excluded — it is added as a structural
        # column, not a randomly sampled feature.
        self._special_features = [
            "age",
            "male",
            "bmi_value",
            "census_ethnicity_white",
            "core_02_val",
            "bed_type_A",
            "vte_status_1",
            "hosp_site_X",
            "core_resus_status",
            "news_score",
        ]

    def _generate_column_names(self) -> List[str]:
        """
        Generates a list of realistic, structured feature names.

        Up to five special features are sampled first, then unique
        ``prefix + suffix`` combinations, and finally ``prefix_r<k>suffix``
        rounds for as long as more names are required.

        Returns:
            List[str]: ``n_features`` unique column names in random order.
        """
        self.logger.info(f"Generating {self.n_features} feature column names...")
        generated_names = []

        num_special = min(len(self._special_features), 5)
        generated_names.extend(random.sample(self._special_features, num_special))

        remaining = self.n_features - len(generated_names)
        if remaining <= 0:
            return generated_names[: self.n_features]

        clean_combinations = [
            f"{prefix}{suffix}"
            for prefix in self._feature_prefixes
            for suffix in self._feature_suffixes
        ]
        random.shuffle(clean_combinations)
        taken = clean_combinations[:remaining]
        generated_names.extend(taken)
        remaining -= len(taken)

        # Every round yields a fresh set of unique names, so loop until the
        # request is satisfied rather than silently returning too few names.
        round_num = 1
        while remaining > 0:
            round_candidates = [
                f"{prefix}_r{round_num}{suffix}"
                for prefix in self._feature_prefixes
                for suffix in self._feature_suffixes
            ]
            if not round_candidates:
                # No prefixes/suffixes configured: nothing more to generate.
                break
            random.shuffle(round_candidates)
            taken = round_candidates[:remaining]
            generated_names.extend(taken)
            remaining -= len(taken)
            round_num += 1

        random.shuffle(generated_names)
        return generated_names[: self.n_features]

    def _generate_typed_data(self, feature_names: List[str], n_rows: int) -> np.ndarray:
        """
        Generates typed feature data for ``n_rows`` rows based on column
        semantics.

        Uses the same name-based typing rules as
        ``SyntheticDataGenerator._generate_typed_data()``, operating on the
        full flattened row count (``n_instances × n_timepoints``).

        Args:
            feature_names (List[str]): Column names; the position of each
                name is the column index in the returned array.
            n_rows (int): Number of rows to generate.

        Returns:
            np.ndarray: A float32 array of shape
            ``(n_rows, len(feature_names))``.
        """
        age_cols, binary_cols, bmi_cols = [], [], []
        int_cols, binary_suffix_cols, normal_cols = [], [], []

        for idx, col in enumerate(feature_names):
            if col == "age":
                age_cols.append(idx)
            elif col == "male" or "vte_status" in col or "bed_type" in col:
                binary_cols.append(idx)
            elif col == "bmi_value":
                bmi_cols.append(idx)
            elif any(s in col for s in self._int_suffixes):
                int_cols.append(idx)
            elif any(s in col for s in self._binary_suffixes):
                binary_suffix_cols.append(idx)
            else:
                normal_cols.append(idx)

        # Pre-allocate; every column is filled below
        data = np.empty((n_rows, len(feature_names)), dtype=np.float32)

        for idx in age_cols:
            data[:, idx] = np.random.randint(20, 90, size=n_rows)
        for idx in binary_cols:
            data[:, idx] = np.random.randint(0, 2, size=n_rows)
        for idx in bmi_cols:
            data[:, idx] = np.random.uniform(18, 45, size=n_rows)
        for idx in int_cols:
            # Poisson counts scaled by a per-column integer factor
            data[:, idx] = np.random.poisson(5, size=n_rows) * random.randint(1, 5)
        for idx in binary_suffix_cols:
            data[:, idx] = np.random.randint(0, 2, size=n_rows)
        if normal_cols:
            data[:, normal_cols] = np.random.randn(n_rows, len(normal_cols)).astype(
                np.float32
            )

        return data

    def _introduce_missing_values(
        self, df: pd.DataFrame, feature_names: List[str]
    ) -> None:
        """
        Sets roughly ``percent_missing`` of the feature values to NaN, in place.

        Each feature cell is blanked independently with probability
        ``percent_missing``; columns not listed in ``feature_names`` are
        left untouched.

        Args:
            df (pd.DataFrame): Frame containing the feature columns.
            feature_names (List[str]): Names of the columns that may receive
                missing values.
        """
        if not feature_names or self.percent_missing <= 0:
            return

        values = df[feature_names].to_numpy(dtype=np.float32, copy=True)
        nan_mask = np.random.rand(*values.shape) < self.percent_missing
        values[nan_mask] = np.nan
        df[feature_names] = values

    def generate(self) -> tuple[pd.DataFrame, dict[str, list[str]]]:
        """
        Generates and returns the synthetic longitudinal DataFrame.

        The output is a long-format 2D DataFrame with one row per
        ``(client_idcode, timestamp)`` pair. Each patient has exactly
        ``n_timepoints`` consecutive daily rows. Outcome labels are generated
        per-row using the same signal/noise + median-threshold approach as
        ``SyntheticDataGenerator``, before missing values are introduced.

        Column order: ``client_idcode | timestamp | <features> | <outcome_vars>``

        Returns:
            tuple[pd.DataFrame, dict[str, list[str]]]:
                - The fully generated longitudinal dataset.
                - A dictionary mapping each outcome variable name to its list
                  of important feature names used to construct it.
        """
        from datetime import datetime, timedelta

        total_rows = self.n_instances * self.n_timepoints
        self.logger.info(
            f"Starting TS generation: {self.n_instances} patients × "
            f"{self.n_timepoints} timepoints = {total_rows} total rows, "
            f"{self.n_features} features"
        )

        # 1. Generate feature column names
        feature_names = self._generate_column_names()

        # 2. Generate typed feature data for all rows at once
        self.logger.info("Generating typed feature data...")
        data = self._generate_typed_data(feature_names, total_rows)
        df = pd.DataFrame(data, columns=feature_names)

        # 3. Build client_idcode and timestamp columns.
        #    Each patient gets n_timepoints consecutive daily timestamps.
        self.logger.info("Building client_idcode and timestamp columns...")
        start = datetime.fromisoformat(self.start_date)
        client_ids = np.repeat(np.arange(1, self.n_instances + 1), self.n_timepoints)
        timestamps = np.tile(
            [
                (start + timedelta(days=d)).strftime("%Y-%m-%d")
                for d in range(self.n_timepoints)
            ],
            self.n_instances,
        )

        # 4. Generate outcome variables — per-row, same approach as tabular version
        outcome_to_features_map: dict[str, list[str]] = {}
        new_cols_dict: dict[str, np.ndarray] = {}
        # Never ask for more columns than exist, which would make
        # DataFrame.sample raise.
        n_important = min(
            len(feature_names),
            max(1, int(self.n_features * self.percent_important_features)),
        )

        self.logger.info(f"Generating {self.n_outcome_vars} outcome variable(s)...")
        for i in tqdm(
            range(1, self.n_outcome_vars + 1),
            desc="Creating outcomes",
            disable=not self.logger.isEnabledFor(logging.INFO),
        ):
            outcome_col_name = f"outcome_var_{i}"

            # Randomly sample important features for this outcome
            important_features = (
                df.columns.to_series()
                .sample(n=n_important, random_state=42 + i)
                .tolist()
            )
            outcome_to_features_map[outcome_col_name] = important_features
            self.logger.info(
                f"  For '{outcome_col_name}', selected {len(important_features)} important features"
            )

            # Signal: scaled sum of important features across all rows
            signal = df[important_features].values.sum(axis=1) * self.feature_strength

            # Noise: scaled by (1 - feature_strength) and signal std
            noise_strength = 1 - self.feature_strength
            noise = (
                np.random.randn(total_rows).astype(np.float32)
                * noise_strength
                * signal.std()
            )

            # Threshold at median → binary outcome per row
            combined_signal = signal + noise
            threshold = np.median(combined_signal)
            outcome = (combined_signal > threshold).astype(np.int8)

            # Randomly flip 10% of labels
            flip_mask = np.random.rand(total_rows) < 0.1
            outcome[flip_mask] = 1 - outcome[flip_mask]

            new_cols_dict[outcome_col_name] = outcome

        # 5. Introduce missing values into feature columns (after outcomes,
        #    so the labels are computed from complete data)
        self.logger.info("Introducing missing values...")
        self._introduce_missing_values(df, feature_names)

        # 6. Assemble final DataFrame: client_idcode | timestamp | features | outcomes
        self.logger.info("Assembling final DataFrame...")
        outcome_df = pd.DataFrame(new_cols_dict, index=df.index)
        df.insert(0, "timestamp", timestamps)
        df.insert(0, "client_idcode", client_ids)
        df = pd.concat([df, outcome_df], axis=1)

        self.logger.info(f"Generation complete! Shape: {df.shape}")
        return df, outcome_to_features_map
def generate_synthetic_ts_data(
    n_instances: int = 200,
    n_timepoints: int = 50,
    n_features: int = 100,
    n_outcome_vars: int = 1,
    feature_strength: float = 0.8,
    percent_important_features: float = 0.1,
    percent_missing: float = 0.1,
    start_date: str = "2022-01-01",
    verbose: bool = True,
) -> tuple[pd.DataFrame, dict[str, list[str]]]:
    """
    Build a synthetic longitudinal dataset in a single call.

    Thin wrapper that forwards every argument unchanged to
    ``SyntheticTSDataGenerator`` and returns the result of its
    ``generate()`` method.

    Args:
        n_instances (int): Number of unique patients.
        n_timepoints (int): Number of daily timestamped rows per patient.
        n_features (int): Number of feature columns to generate.
        n_outcome_vars (int): Number of binary outcome columns to generate.
        feature_strength (float): Strength of the signal from important
            features. Must be between 0 and 1.
        percent_important_features (float): Fraction of features that should
            be predictive of each outcome.
        percent_missing (float): Approximate fraction of feature values to
            set to NaN.
        start_date (str): ISO date string for the first timestamp.
        verbose (bool): If True, enables logging of generation status.

    Returns:
        tuple[pd.DataFrame, dict[str, list[str]]]:
            - The generated longitudinal dataset.
            - A dictionary mapping each outcome variable to its important
              features.
    """
    # Collect the settings once, then hand them over by keyword.
    settings = {
        "n_instances": n_instances,
        "n_timepoints": n_timepoints,
        "n_features": n_features,
        "n_outcome_vars": n_outcome_vars,
        "feature_strength": feature_strength,
        "percent_important_features": percent_important_features,
        "percent_missing": percent_missing,
        "start_date": start_date,
        "verbose": verbose,
    }
    return SyntheticTSDataGenerator(**settings).generate()
def generate_synthetic_data(
    n_rows: int = 1000,
    n_features: int = 150,
    n_outcome_vars: int = 3,
    feature_strength: float = 0.8,
    percent_important_features: float = 0.1,
    percent_binary_features: float = 0.15,
    percent_int_features: float = 0.2,
    verbose: bool = True,
) -> tuple[pd.DataFrame, dict[str, list[str]]]:
    """
    Build a synthetic tabular dataset in a single call.

    Thin wrapper that forwards every argument unchanged to
    ``SyntheticDataGenerator`` and returns what its ``generate()`` method
    produces.

    Args:
        n_rows (int): Number of rows for the synthetic dataset.
        n_features (int): Number of feature columns to generate.
        n_outcome_vars (int): Number of outcome variables to generate.
        feature_strength (float): Strength of the signal from important
            features. Must be between 0 and 1.
        percent_important_features (float): Fraction of features that should
            be predictive of the outcome.
        percent_binary_features (float): Fraction of features to be binary.
        percent_int_features (float): Fraction of features to be
            integer-based.
        verbose (bool): If True, enables logging of generation status.

    Returns:
        tuple[pd.DataFrame, dict[str, list[str]]]:
            - The generated synthetic dataset.
            - A dictionary mapping each outcome variable to its list of
              important features.
    """
    # Collect the settings once, then hand them over by keyword.
    settings = {
        "n_rows": n_rows,
        "n_features": n_features,
        "n_outcome_vars": n_outcome_vars,
        "feature_strength": feature_strength,
        "percent_important_features": percent_important_features,
        "percent_binary_features": percent_binary_features,
        "percent_int_features": percent_int_features,
        "verbose": verbose,
    }
    dataset, important_by_outcome = SyntheticDataGenerator(**settings).generate()
    return dataset, important_by_outcome
if __name__ == "__main__":
    # Demo script: build one tabular and one time-series dataset, record the
    # per-column missingness, mean-impute, and write the results to CSV.
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
    )

    # Imputation helpers are only needed when run as a script.
    from ml_grid.util.impute_data_for_pipe import (
        mean_impute_dataframe,
        save_missing_percentage,
    )

    # Output locations for both examples, gathered in one place.
    missing_pickle_filename = "percent_missing_synthetic_data_generated.pkl"
    output_csv_filename = "synthetic_data_generated.csv"
    ts_missing_pickle = "percent_missing_synthetic_ts_data.pkl"
    ts_output_csv = "synthetic_ts_data_generated.csv"

    # ---------------- Tabular example ----------------
    logging.info(
        "Generating a sample synthetic dataset using the importable function..."
    )
    synthetic_df, important_feature_map = generate_synthetic_data(
        n_rows=500,
        n_features=100,
        n_outcome_vars=3,
        feature_strength=0.7,
        percent_important_features=0.2,
        verbose=True,
    )
    outcome_columns = list(important_feature_map.keys())

    logging.info("\nGenerated DataFrame info (before imputation):")
    logging.info(f"NaNs present: {synthetic_df.isnull().sum().sum()}")

    # Step 1: persist the share of missing values per column
    logging.info(
        f"\nCalculating missing value percentages and saving to '{missing_pickle_filename}'..."
    )
    save_missing_percentage(synthetic_df, output_file=missing_pickle_filename)
    logging.info("Missing value pickle file saved.")

    # Step 2: mean-impute, passing the outcome columns as y_vars
    logging.info("\nPerforming mean imputation on the dataset...")
    imputed_df = mean_impute_dataframe(data=synthetic_df, y_vars=outcome_columns)
    logging.info(
        f"Imputation complete. NaNs present after imputation: {imputed_df.isnull().sum().sum()}"
    )

    # Step 3: write the imputed frame to disk
    imputed_df.to_csv(output_csv_filename, index=False)
    logging.info(f"\nImputed data saved to '{output_csv_filename}'")

    # ---------------- Time-series example ----------------
    logging.info("\n\n" + "=" * 50)
    logging.info("--- Generating Time-Series Data ---")
    logging.info("=" * 50)

    # Step 1: generate the long-format longitudinal frame
    ts_df, ts_important_map = generate_synthetic_ts_data(
        n_instances=200,
        n_timepoints=50,
        n_features=100,
        n_outcome_vars=1,
        feature_strength=0.7,
        percent_important_features=0.2,
        percent_missing=0.1,
        start_date="2022-01-01",
        verbose=True,
    )

    logging.info(f"\nTS DataFrame shape: {ts_df.shape}")
    logging.info(f"Unique patients: {ts_df['client_idcode'].nunique()}")
    logging.info(
        f"Rows per patient: {ts_df.groupby('client_idcode').size().unique().tolist()}"
    )
    logging.info(f"NaNs present: {ts_df.isnull().sum().sum()}")

    # Step 2: missingness is recorded for feature columns only, i.e.
    # everything except the id, the timestamp and the outcomes.
    outcome_columns_ts = list(ts_important_map.keys())
    feature_cols_ts = [
        c
        for c in ts_df.columns
        if c not in ("client_idcode", "timestamp") and c not in outcome_columns_ts
    ]
    logging.info(
        f"\nCalculating missing value percentages and saving to '{ts_missing_pickle}'..."
    )
    save_missing_percentage(ts_df[feature_cols_ts], output_file=ts_missing_pickle)
    logging.info("Missing value pickle file saved for TS data.")

    # Step 3: mean-impute; id, timestamp and outcome columns go in as y_vars
    logging.info("\nPerforming mean imputation on the dataset...")
    non_feature_cols_ts = ["client_idcode", "timestamp"] + outcome_columns_ts
    imputed_ts_df = mean_impute_dataframe(data=ts_df, y_vars=non_feature_cols_ts)
    logging.info(
        f"Imputation complete. NaNs present after imputation: {imputed_ts_df.isnull().sum().sum()}"
    )

    # Step 4: write the imputed longitudinal frame to disk
    imputed_ts_df.to_csv(ts_output_csv, index=False)
    logging.info(f"\nImputed TS data saved to '{ts_output_csv}'")