Source code for ml_grid.pipeline.data_percent_missing

import os
import pickle
from typing import Any, Dict, List


def _load_percent_missing_dict(base_name: str) -> Dict[str, Any]:
    """Load the pre-calculated percent-missing mapping for a data file.

    Looks for ``{base_name}_percent_missing.pkl`` first and falls back to
    ``{base_name}_percent_missing.pickle``. A missing or unreadable file is
    reported on stdout and yields an empty dict instead of raising.

    Args:
        base_name (str): Data file path with the ``.csv`` part removed.

    Returns:
        Dict[str, Any]: The unpickled object (expected to map column names
        to percent-missing values), or ``{}`` if nothing could be loaded.
    """
    pkl_path = f"{base_name}_percent_missing.pkl"
    # Prefer the .pkl extension; otherwise fall back to .pickle.
    if os.path.exists(pkl_path):
        percent_missing_filename = pkl_path
    else:
        percent_missing_filename = f"{base_name}_percent_missing.pickle"

    if not os.path.exists(percent_missing_filename):
        print(f"File {percent_missing_filename} not found. Returning empty dict.")
        return {}

    with open(percent_missing_filename, "rb") as handle:
        try:
            # NOTE(security): pickle.load can execute arbitrary code. This is
            # only safe if the percent-missing file comes from a trusted
            # source (e.g. produced by this project's own pipeline).
            return pickle.load(handle)
        except Exception as e:
            # Deliberate best-effort: a corrupt file disables the check
            # rather than aborting the caller.
            print(f"Error loading pickle file: {e}")
            return {}


def handle_percent_missing(
    local_param_dict: Dict[str, Any],
    all_df_columns: List[str],
    file_name: str,
    drop_list: List[str],
) -> List[str]:
    """Handles the removal of columns with a high percentage of missing data.

    This function reads a pre-calculated dictionary of missing data
    percentages from a pickle file located next to the input data file
    (``<file_name without .csv>_percent_missing.pkl`` or ``.pickle``). Every
    column in `all_df_columns` whose recorded percentage is strictly greater
    than the ``'percent_missing'`` threshold in `local_param_dict` is appended
    to `drop_list`.

    The check is skipped (and `drop_list` returned unchanged) when no
    threshold is configured or when no percent-missing data could be loaded.

    Args:
        local_param_dict (Dict[str, Any]): Dictionary of parameters for the
            current pipeline; the threshold is read from its
            ``'percent_missing'`` key (a missing key or ``None`` value
            disables the check).
        all_df_columns (List[str]): All column names in the DataFrame to be
            processed.
        file_name (str): The name of the input data file, used to find the
            corresponding percent missing pickle file.
        drop_list (List[str]): List of columns to be dropped from the
            DataFrame. It is extended in place.

    Returns:
        List[str]: The same `drop_list` object, extended with the columns
        that exceed the threshold.

    Raises:
        AssertionError: If `local_param_dict`, `all_df_columns` or
            `drop_list` is ``None``.
    """
    # Kept as assertions so the exception type seen by callers is unchanged.
    assert local_param_dict is not None
    assert all_df_columns is not None
    assert drop_list is not None

    filename = file_name.replace(".csv", "")
    percent_missing_dict = _load_percent_missing_dict(filename)
    percent_missing_threshold = local_param_dict.get("percent_missing")

    # The previous guard used `percent_missing_dict is not {}`, an identity
    # comparison against a brand-new dict that is always True, so the skip
    # branch could never trigger for an empty dict. len() is used rather
    # than plain truthiness so mapping-like payloads with an ambiguous
    # truth value do not raise here.
    try:
        has_entries = len(percent_missing_dict) > 0
    except TypeError:
        # Unsized payload loaded from the pickle: nothing usable to check.
        has_entries = False

    if percent_missing_threshold is None or not has_entries:
        print(
            "percent_missing_threshold is None or percent_missing_dict is empty. Skipping percent missing data check."
        )
        return drop_list

    percent_missing_drop_list = []
    for col in all_df_columns:
        try:
            if (
                col in percent_missing_dict
                and percent_missing_dict.get(col) > percent_missing_threshold
            ):
                percent_missing_drop_list.append(col)
        except Exception as e:
            # Best-effort: a value that cannot be compared with the
            # threshold is reported and the column is kept.
            print(f"Error processing column {col}: {e}")

    print(
        f"Identified {len(percent_missing_drop_list)} columns with > {percent_missing_threshold} percent missing data."
    )

    # Extend the caller's drop list with the identified columns.
    drop_list.extend(percent_missing_drop_list)
    return drop_list