Spaces:

Sunbird
/

acres

Sleeping

File size: 8,234 Bytes

# utils/helpers.py

import json
import os
import re
from typing import Any, Dict, List

import chromadb
from chromadb.api.types import Document
from llama_index.core import Response

from rag.rag_pipeline import RAGPipeline
from utils.prompts import (
    StudyCharacteristics,
    VaccineCoverageVariables,
    structured_follow_up_prompt,
)

# Initialize ChromaDB client.
# Created once, at import time, and shared by the helpers in this module
# (add_study_files_to_chromadb looks up its collection through it).
# NOTE(review): chromadb.Client() is called with no settings, which presumably
# yields a non-persistent in-memory client -- confirm against the installed
# chromadb version if the stored data is expected to survive a restart.
chromadb_client = chromadb.Client()


def read_study_files(file_path):
    """
    Reads a JSON file and returns the parsed JSON data.

    Args:
        file_path (str): The path to the JSON file to be read.

    Returns:
        dict: The data from the JSON file as a Python dictionary (whatever
            top-level JSON value the file holds is returned unchanged).

    Raises:
        FileNotFoundError: If the file is not found at the provided path.
        ValueError: If the file contents are not valid JSON. The underlying
            json.JSONDecodeError is chained as the exception's ``__cause__``.

    Example:
        Given a JSON file 'study_files.json' with content like:
        {
            "Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
            "Ebola Virus": "data/ebola_virus_zotero_items.json",
            "Gene Xpert": "data/gene_xpert_zotero_items.json"
        }

        Calling `read_study_files("study_files.json")` will return:
        {
            "Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
            "Ebola Virus": "data/ebola_virus_zotero_items.json",
            "Gene Xpert": "data/gene_xpert_zotero_items.json"
        }
    """
    try:
        # JSON text is UTF-8 by specification; being explicit avoids decoding
        # with whatever the platform's locale encoding happens to be.
        with open(file_path, "r", encoding="utf-8") as file:
            return json.load(file)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"The file at path {file_path} was not found.") from e
    except json.JSONDecodeError as e:
        raise ValueError(
            f"The file at path {file_path} does not contain valid JSON."
        ) from e


def append_to_study_files(file_path, new_key, new_value):
    """
    Appends a new key-value entry to an existing JSON file.

    If ``new_key`` is already present its value is overwritten. The updated
    document is serialized in memory before the file is reopened for writing,
    so a failure to serialize never damages the existing file.

    Args:
        file_path (str): The path to the JSON file.
        new_key (str): The new key to add to the JSON file.
        new_value (any): The value associated with the new key (can be any valid JSON data type).

    Raises:
        FileNotFoundError: If the file is not found at the provided path.
        ValueError: If the file contents are not valid JSON. The underlying
            json.JSONDecodeError is chained as the exception's ``__cause__``.
        TypeError: If ``new_value`` cannot be serialized to JSON; the file is
            left unchanged in that case.
        IOError: If the file cannot be read or written.

    Example:
        If the file 'study_files.json' initially contains:
        {
            "Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
            "Ebola Virus": "data/ebola_virus_zotero_items.json"
        }

        Calling `append_to_study_files("study_files.json", "Gene Xpert", "data/gene_xpert_zotero_items.json")`
        will modify the file to:
        {
            "Vaccine Coverage": "data/vaccine_coverage_zotero_items.json",
            "Ebola Virus": "data/ebola_virus_zotero_items.json",
            "Gene Xpert": "data/gene_xpert_zotero_items.json"
        }
    """
    try:
        # Read the existing data from the file
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        # Append the new key-value pair to the dictionary
        data[new_key] = new_value

        # Serialize BEFORE opening for write: mode "w" truncates the file
        # immediately, so a non-serializable value would otherwise leave a
        # half-written, corrupt JSON file behind.
        serialized = json.dumps(data, indent=4)  # indent for pretty printing

        # Write the updated data back to the file
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(serialized)

    except FileNotFoundError as e:
        raise FileNotFoundError(f"The file at path {file_path} was not found.") from e
    except json.JSONDecodeError as e:
        raise ValueError(
            f"The file at path {file_path} does not contain valid JSON."
        ) from e
    except IOError as e:
        raise IOError(f"Failed to write to the file at {file_path}.") from e


# List markers that may prefix each generated question: "1. ", "2) ", "- ",
# "* ", "• ". Anchored at the start of the line so that a ". " occurring inside
# the question itself (e.g. after an abbreviation) is never treated as a marker.
_LEADING_LIST_MARKER = re.compile(r"^\s*(?:\d+\s*[.)]|[-*•])\s+")


def generate_follow_up_questions(
    rag: RAGPipeline, response: str, query: str, study_name: str
) -> List[str]:
    """
    Generates follow-up questions based on the given RAGPipeline, response, query, and study_name.

    The study type is chosen by substring match on ``study_name``; it selects
    the list of key variables that is prepended to ``response`` as context for
    the follow-up prompt. The text returned by ``rag.query`` is split into
    lines, each line is stripped of a leading list marker, given a trailing
    "?" if it lacks one, and prefixed with "✨ ".

    Args:
        rag (RAGPipeline): The RAGPipeline object used for generating follow-up questions.
        response (str): The response to the initial query.
        query (str): The initial query.
        study_name (str): The name of the study.
    Returns:
        List[str]: At most three generated follow-up questions.
    Raises:
        Nothing explicitly; any error raised by ``rag.query`` or by formatting
        the prompt propagates to the caller.
    """

    # Determine the study type based on the study_name
    if "Vaccine Coverage" in study_name:
        study_type = "Vaccine Coverage"
        key_variables = list(VaccineCoverageVariables.__annotations__.keys())
    elif "Ebola Virus" in study_name:
        study_type = "Ebola Virus"
        key_variables = [
            "SAMPLE_SIZE",
            "PLASMA_TYPE",
            "DOSAGE",
            "FREQUENCY",
            "SIDE_EFFECTS",
            "VIRAL_LOAD_CHANGE",
            "SURVIVAL_RATE",
        ]
    elif "Gene Xpert" in study_name:
        study_type = "Gene Xpert"
        key_variables = [
            "OBJECTIVE",
            "OUTCOME_MEASURES",
            "SENSITIVITY",
            "SPECIFICITY",
            "COST_COMPARISON",
            "TURNAROUND_TIME",
        ]
    else:
        study_type = "General"
        key_variables = list(StudyCharacteristics.__annotations__.keys())

    # Add key variables to the context
    context = f"Study type: {study_type}\nKey variables to consider: {', '.join(key_variables)}\n\n{response}"

    follow_up_response = rag.query(
        structured_follow_up_prompt.format(
            context_str=context,
            query_str=query,
            response_str=response,
            study_type=study_type,
        )
    )

    # Defensive: treat a missing response text as "no questions" instead of
    # failing on None.strip().
    raw_text = follow_up_response.response or ""

    cleaned_questions = []
    for line in raw_text.strip().split("\n"):
        # Remove only a leading list marker; the rest of the line is kept intact
        cleaned_q = _LEADING_LIST_MARKER.sub("", line, count=1).strip()
        if not cleaned_q:
            continue
        # Ensure the question ends with a question mark
        if not cleaned_q.endswith("?"):
            cleaned_q += "?"
        cleaned_questions.append(f"✨ {cleaned_q}")
    return cleaned_questions[:3]


def add_study_files_to_chromadb(file_path: str, collection_name: str):
    """
    Reads the study files data from a JSON file and stores it in the specified ChromaDB collection.

    Every top-level key of the JSON object becomes a record ID and its value is
    stored under the ``file_path`` metadata key; the document text is left
    empty. Records are upserted, so calling this again after the JSON mapping
    changed refreshes the stored paths instead of leaving stale entries.

    If the JSON file does not exist a message is printed and nothing is stored;
    an empty mapping is a silent no-op. Invalid JSON is not handled here and
    propagates as ``json.JSONDecodeError``.

    :param file_path: Path to the JSON file containing study files data.
    :param collection_name: Name of the ChromaDB collection to store the data.
    """
    # Load study files data from JSON file
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            study_files_data = json.load(f)
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
        return

    if not study_files_data:
        return

    # Get or create the collection in ChromaDB
    collection = chromadb_client.get_or_create_collection(collection_name)

    # Build the parallel lists for one batched write. The per-study path gets
    # its own name so it does not shadow the ``file_path`` parameter.
    ids = list(study_files_data)
    documents = [""] * len(ids)  # Optional text, left empty because it is not used
    metadatas = [
        {"file_path": study_path} for study_path in study_files_data.values()
    ]

    # Upsert rather than add: an ID that already exists gets its metadata
    # updated, which keeps repeated runs idempotent and current.
    collection.upsert(ids=ids, documents=documents, metadatas=metadatas)

    print("All study files have been successfully added to ChromaDB.")


def create_directory(directory_path):
    """
    Make sure a directory exists, creating it (and any missing parents) if needed.

    An already existing directory counts as success. Failures are reported on
    stdout and signalled through the return value instead of an exception.

    Args:
        directory_path (str): Path of the directory to create

    Returns:
        bool: True if directory was created or already exists, False if creation failed
    """
    succeeded = False
    try:
        # exist_ok=True makes an existing directory a no-op rather than an error
        os.makedirs(directory_path, exist_ok=True)
    except PermissionError:
        print(f"Permission denied: Cannot create directory {directory_path}")
    except Exception as error:
        # Best-effort helper: every other failure is reported, never raised
        print(f"An unexpected error occurred: {error}")
    else:
        succeeded = True
    return succeeded


if __name__ == "__main__":
    # Usage example: load the mapping found in the current working directory
    # into the named ChromaDB collection.
    example_json_path = "study_files.json"
    example_collection = "study_files_collection"
    add_study_files_to_chromadb(example_json_path, example_collection)