"""Source code for pat2vec.pat2vec_get_methods.get_method_news."""

import os
from typing import Callable, Dict, Optional, Tuple, List

import numpy as np
import pandas as pd
from IPython.display import display

from pat2vec.util.filter_dataframe_by_timestamp import filter_dataframe_by_timestamp
from pat2vec.util.get_start_end_year_month import get_start_end_year_month
from pat2vec.util.parse_date import validate_input_dates


def compute_feature_stats(
    data: pd.DataFrame, column: str, feature_name: str, config_obj: object
) -> Dict:
    """Summarise a numeric NEWS feature column.

    Coerces the raw column to numeric (non-parseable entries are dropped)
    and produces the standard per-feature statistics. When no usable
    values remain, NaN placeholders are emitted only if the configuration
    requests them via `negate_biochem`, so downstream frames keep a
    consistent column set.

    Args:
        data (pd.DataFrame): Rows of patient data for a single feature.
        column (str): Column holding the raw values
            (e.g. 'observation_valuetext_analysed').
        feature_name (str): Prefix used for the generated stat keys.
        config_obj (object): Configuration object exposing the
            `negate_biochem` flag.

    Returns:
        Dict: Mapping of '<feature_name>_<stat>' to the computed value
        for stats mean, median, std, max, min and n. Empty when there
        are no numeric values and `negate_biochem` is False.
    """
    suffixes = ("mean", "median", "std", "max", "min", "n")

    if not data.empty:
        numeric = pd.to_numeric(data[column], errors="coerce").dropna()
        if not numeric.empty:
            computed = (
                numeric.mean(),
                numeric.median(),
                numeric.std(),
                numeric.max(),
                numeric.min(),
                numeric.shape[0],
            )
            return {
                f"{feature_name}_{suffix}": value
                for suffix, value in zip(suffixes, computed)
            }

    # No usable values: optionally emit NaN placeholders.
    if config_obj.negate_biochem:
        return {f"{feature_name}_{suffix}": np.nan for suffix in suffixes}
    return {}
def search_news_observations(
    cohort_searcher_with_terms_and_search=None,
    client_id_codes=None,
    observations_time_field="observationdocument_recordeddtm",
    fields_override: Optional[List[str]] = None,
    start_year="1995",
    start_month="01",
    start_day="01",
    end_year="2025",
    end_month="12",
    end_day="12",
    additional_custom_search_string=None,
    index_name: str = "observations",
    output_filename: Optional[str] = "news_search_results.csv",
    overwrite: bool = False,
    config_obj: Optional[object] = None,
):
    """Searches for NEWS/NEWS2 observation data within a date range.

    Results are cached to CSV: if `output_filename` already exists and
    `overwrite` is False, the cached frame is loaded instead of searching.

    Args:
        cohort_searcher_with_terms_and_search: Callable performing the
            cohort search. Required when no cached file is used.
        client_id_codes: A patient identifier or list of identifiers.
        observations_time_field (str): Timestamp field used in the range
            filter of the search string.
        fields_override (Optional[List[str]]): Replaces the default field
            list when provided.
        start_year/start_month/start_day, end_year/end_month/end_day:
            Date-range bounds; validated via `validate_input_dates`.
            NOTE(review): the default `end_day="12"` looks like it may
            have been intended as "31" — confirm before relying on it.
        additional_custom_search_string (Optional[str]): Extra text
            appended verbatim (after a space) to the search string;
            presumably includes its own boolean operator.
        index_name (str): Search index to query.
        output_filename (Optional[str]): CSV path to load from / save to.
        overwrite (bool): If True, search even if the cache file exists.
        config_obj (Optional[object]): May supply `root_path`/`proj_name`
            (to anchor the cache path) and `client_idcode_term_name`.

    Returns:
        pd.DataFrame: The search results (or the cached frame).

    Raises:
        ValueError: If the searcher or `client_id_codes` is None when a
            fresh search is required.
    """
    # Anchor the cache file under the project root when one is configured.
    has_project_paths = (
        config_obj is not None
        and hasattr(config_obj, "root_path")
        and hasattr(config_obj, "proj_name")
    )
    if output_filename and has_project_paths:
        output_filename = os.path.join(
            config_obj.root_path, config_obj.proj_name, output_filename
        )

    # Serve from cache unless the caller explicitly wants a fresh search.
    if output_filename and os.path.exists(output_filename) and not overwrite:
        print(f"Loading existing news data from {output_filename}")
        return pd.read_csv(output_filename)

    if cohort_searcher_with_terms_and_search is None:
        raise ValueError("cohort_searcher_with_terms_and_search cannot be None.")
    if client_id_codes is None:
        raise ValueError("client_id_codes cannot be None.")

    # The search helper expects a list of patient identifiers.
    if isinstance(client_id_codes, str):
        client_id_codes = [client_id_codes]

    start_year, start_month, start_day, end_year, end_month, end_day = (
        validate_input_dates(
            start_year, start_month, start_day, end_year, end_month, end_day
        )
    )

    search_string = (
        "obscatalogmasteritem_displayname:(NEWS*) AND "
        f"{observations_time_field}:[{start_year}-{start_month}-{start_day} "
        f"TO {end_year}-{end_month}-{end_day}]"
    )
    if additional_custom_search_string:
        search_string += f" {additional_custom_search_string}"

    default_fields = [
        "observation_guid",
        "client_idcode",
        "obscatalogmasteritem_displayname",
        "observation_valuetext_analysed",
        "observationdocument_recordeddtm",
        "clientvisit_visitidcode",
    ]
    fields_to_use = fields_override if fields_override else default_fields

    # getattr(None, ..., default) safely falls back when config_obj is None.
    client_idcode_term_name = getattr(
        config_obj, "client_idcode_term_name", "client_idcode.keyword"
    )

    results = cohort_searcher_with_terms_and_search(
        index_name=index_name,
        fields_list=fields_to_use,
        term_name=client_idcode_term_name,
        entered_list=client_id_codes,
        search_string=search_string,
    )

    if output_filename:
        parent_dir = os.path.dirname(output_filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        print(f"Saving news data to {output_filename}")
        results.to_csv(output_filename, index=False)

    return results
def get_news(
    current_pat_client_id_code: str,
    target_date_range: Tuple,
    pat_batch: pd.DataFrame,
    config_obj: Optional[object] = None,
    cohort_searcher_with_terms_and_search: Optional[Callable] = None,
    fields_override: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Retrieves NEWS/NEWS2 features for a patient within a date range.

    Fetches NEWS (National Early Warning Score) observations — from the
    pre-loaded batch in batch mode, otherwise via a live search — then
    computes summary statistics (mean, median, std, max, min, n) for each
    NEWS component via `compute_feature_stats`.

    Args:
        current_pat_client_id_code (str): The client ID code of the patient.
        target_date_range (Tuple): The target date range.
        pat_batch (pd.DataFrame): Patient data used in batch mode.
        config_obj (Optional[object]): Configuration with `batch_mode`,
            `negate_biochem` and `verbosity`. Defaults to None.
        cohort_searcher_with_terms_and_search (Optional[Callable]): Cohort
            search function for non-batch mode. Defaults to None.
        fields_override (Optional[List[str]]): Overrides the default search
            fields. Defaults to None.

    Returns:
        pd.DataFrame: Single-row DataFrame of NEWS features for the patient.
    """
    start_year, start_month, end_year, end_month, start_day, end_day = (
        get_start_end_year_month(target_date_range, config_obj=config_obj)
    )

    # NOTE(review): this early return also fires in non-batch mode, skipping
    # the live search whenever the supplied batch is empty — confirm that is
    # intended for non-batch callers.
    if pat_batch.empty:
        return pd.DataFrame({"client_idcode": [current_pat_client_id_code]})

    if config_obj.batch_mode:
        raw_news = filter_dataframe_by_timestamp(
            pat_batch,
            start_year,
            start_month,
            end_year,
            end_month,
            start_day,
            end_day,
            "observationdocument_recordeddtm",
        )
    else:
        raw_news = search_news_observations(
            cohort_searcher_with_terms_and_search=cohort_searcher_with_terms_and_search,
            client_id_codes=current_pat_client_id_code,
            observations_time_field="observationdocument_recordeddtm",
            start_year=start_year,
            start_month=start_month,
            start_day=start_day,
            end_year=end_year,
            end_month=end_month,
            end_day=end_day,
            fields_override=fields_override,
            output_filename=None,
            config_obj=config_obj,
        )

    # The output row always carries the patient identifier.
    news_features = {"client_idcode": current_pat_client_id_code}

    # Observation display name -> output feature prefix.
    feature_map = {
        "NEWS2_Score": "news_score",
        "NEWS_Systolic_BP": "news_systolic_bp",
        "NEWS_Diastolic_BP": "news_diastolic_bp",
        "NEWS_Respiration_Rate": "news_respiration_rate",
        "NEWS_Heart_Rate": "news_heart_rate",
        "NEWS_Oxygen_Saturation": "news_oxygen_saturation",
        "NEWS Temperature": "news_temperature",
        "NEWS_AVPU": "news_avpu",
        "NEWS_Supplemental_Oxygen": "news_supplemental_oxygen",
        "NEWS2_Sp02_Target": "news_sp02_target",
        "NEWS2_Sp02_Scale": "news_sp02_scale",
        "NEWS_Pulse_Type": "news_pulse_type",
        "NEWS_Pain_Score": "news_pain_score",
        "NEWS Oxygen Litres": "news_oxygen_litres",
        "NEWS Oxygen Delivery": "news_oxygen_delivery",
    }

    for display_name, feature_name in feature_map.items():
        rows = raw_news[
            raw_news["obscatalogmasteritem_displayname"] == display_name
        ].copy()
        rows.dropna(subset=["observation_valuetext_analysed"], inplace=True)

        # Special case: drop NEWS2 total scores outside the open interval
        # (-20, 20). Bounds are strict, so exactly +/-20 (and any value that
        # fails numeric coercion) is excluded as well.
        if feature_name == "news_score" and len(rows) > 0:
            as_numeric = pd.to_numeric(
                rows["observation_valuetext_analysed"], errors="coerce"
            )
            rows = rows[(as_numeric < 20) & (as_numeric > -20)].copy()

        news_features.update(
            compute_feature_stats(
                rows, "observation_valuetext_analysed", feature_name, config_obj
            )
        )

    news_features_df = pd.DataFrame([news_features])

    if config_obj.verbosity >= 6:
        display(news_features_df)

    return news_features_df