Source code for pat2vec.util.methods_annotation_multi_annots_to_df

from pat2vec.util.methods_annotation_json_to_dataframe import json_to_dataframe
import logging
from pat2vec.util.methods_get import update_pbar
from pat2vec.util.post_processing import (
    join_icd10_OPC4S_codes_to_annot,
    join_icd10_codes_to_annot,
)

import pandas as pd
import os
import shutil
import tempfile
from contextlib import contextmanager
from typing import Any, Dict, Iterator, List
from IPython.display import display

logger = logging.getLogger(__name__)


@contextmanager
def temporary_file(suffix: str = ".csv", delete: bool = True) -> Iterator[str]:
    """Context manager for creating and cleaning up temporary files.

    Args:
        suffix: The file suffix for the temporary file.
        delete: If True, the file is deleted upon exiting the context.

    Yields:
        The path to the temporary file.
    """
    temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    temp_file.close()
    try:
        yield temp_file.name
    finally:
        if delete and os.path.exists(temp_file.name):
            os.remove(temp_file.name)
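
# --- Illustrative usage sketch (added for documentation; `_demo_temporary_file`
# is a hypothetical helper, not part of the original module). It shows the
# intended pattern: the context manager yields the path of an already-closed
# NamedTemporaryFile, so pandas can write to and read from that path, and the
# file is removed on exit while `delete=True` (the default).
def _demo_temporary_file() -> pd.DataFrame:
    with temporary_file(suffix=".csv") as tmp_path:
        pd.DataFrame({"cui": ["C0011849"], "acc": [0.99]}).to_csv(
            tmp_path, index=False
        )
        roundtrip = pd.read_csv(tmp_path)
    # By this point the temporary file has been deleted.
    return roundtrip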


def multi_annots_to_df(
    current_pat_client_idcode: str,
    pat_batch: pd.DataFrame,
    multi_annots: List[Dict[str, Any]],
    config_obj: Any,
    t: Any,
    text_column: str = "body_analysed",
    time_column: str = "updatetime",
    guid_column: str = "document_guid",
) -> pd.DataFrame:
    """Processes MedCAT annotations for a batch of documents, creating and saving a DataFrame.

    This function takes a list of MedCAT annotation results corresponding to a
    batch of documents for a single patient. It iterates through each document's
    annotations, converts them from a JSON-like dictionary format into a
    structured pandas DataFrame using `json_to_dataframe`, and concatenates them
    into a single master DataFrame for the patient. The function can optionally
    enrich the annotation data by joining it with ICD-10 and OPCS-4 codes based
    on settings in the configuration object. Finally, the resulting DataFrame is
    saved as a CSV file in the patient's designated annotation directory.

    Args:
        current_pat_client_idcode: The unique identifier for the patient.
        pat_batch: A DataFrame where each row represents a document in the
            patient's batch.
        multi_annots: A list of dictionaries, where each dictionary contains the
            MedCAT annotation entities for a corresponding document in
            `pat_batch`.
        config_obj: A configuration object containing settings such as file
            paths (`pre_document_annotation_batch_path`), verbosity level, and
            flags for `add_icd10` and `add_opc4s`. Required.
        t: A tqdm progress bar object for providing real-time feedback.
        text_column: The name of the column in `pat_batch` that contains the
            document text to be annotated. Defaults to 'body_analysed'.
        time_column: The name of the column in `pat_batch` that holds the
            timestamp for each document. Defaults to 'updatetime'.
        guid_column: The name of the column in `pat_batch` that contains the
            unique identifier for each document. Defaults to 'document_guid'.

    Returns:
        A consolidated DataFrame containing all annotations for the patient's
        document batch. An empty DataFrame is returned if no valid annotations
        are processed.

    Raises:
        ValueError: If `config_obj` is not provided.
    """
    if config_obj is None:
        raise ValueError("config_obj is required")

    processed_dfs = []

    for i in range(len(pat_batch)):
        try:
            doc_to_annot_df = json_to_dataframe(
                json_data=multi_annots[i],
                doc=pat_batch.iloc[i],
                current_pat_client_id_code=current_pat_client_idcode,
                text_column=text_column,
                time_column=time_column,
                guid_column=guid_column,
            )

            if not doc_to_annot_df.empty:
                doc_to_annot_df.dropna(
                    subset=["client_idcode", time_column], inplace=True
                )
                if not doc_to_annot_df.empty:
                    processed_dfs.append(doc_to_annot_df)
        except Exception as e:
            if config_obj.verbosity >= 1:
                logger.warning(f"Error processing document {i}: {str(e)}")
            continue

    if processed_dfs:
        final_df = pd.concat(processed_dfs, ignore_index=True)
    else:
        # If no data, create an empty DataFrame with the expected columns.
        col_list = [
            "client_idcode",
            time_column,
            "pretty_name",
            "cui",
            "type_ids",
            "types",
            "source_value",
            "detected_name",
            "acc",
            "context_similarity",
            "start",
            "end",
            "icd10",
            "ontologies",
            "snomed",
            "id",
            "Time_Value",
            "Time_Confidence",
            "Presence_Value",
            "Presence_Confidence",
            "Subject_Value",
            "Subject_Confidence",
            "text_sample",
            "full_doc",
            guid_column,
        ]
        final_df = pd.DataFrame(columns=col_list)

    # Optionally join ICD-10 and OPCS-4 codes onto the annotations.
    try:
        if not final_df.empty:  # Only join if there is data
            if config_obj.add_icd10 and config_obj.add_opc4s:
                final_df = join_icd10_OPC4S_codes_to_annot(df=final_df, inner=False)
            elif config_obj.add_icd10:
                final_df = join_icd10_codes_to_annot(df=final_df, inner=False)
    except Exception as e:
        if config_obj.verbosity >= 1:
            logger.warning(f"Error joining ICD10/OPC4S codes: {str(e)}")

    # Write the final DataFrame to CSV, even when it is empty.
    destination_path = os.path.join(
        config_obj.pre_document_annotation_batch_path,
        current_pat_client_idcode + ".csv",
    )
    final_df.to_csv(destination_path, index=False)

    return final_df