import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import threading

# Title and description
TITLE = "AI Copilot for Patients"
DESCRIPTION = "I provide answers to health-related concerns."

# Globals shared between the loader thread and the chat handler
llm_llama_cpp = None
model_ready = False

# Download and initialize the model in the background so the UI
# can come up immediately instead of blocking on the download.
def load_model():
    global llm_llama_cpp, model_ready
    try:
        print("Downloading model...")
        model_file_path = hf_hub_download(
            repo_id="TheBloke/Llama-2-7B-GGUF",
            filename="llama-2-7b.Q4_0.gguf",
        )
        print("Initializing model...")
        llm_llama_cpp = Llama(
            model_path=model_file_path,
            verbose=False,
            n_ctx=4096,
        )
        model_ready = True
        print("Model is ready.")
    except Exception as e:
        print(f"Failed to load model: {e}")

# Daemon thread so an in-flight download doesn't block interpreter exit
threading.Thread(target=load_model, daemon=True).start()

# Chatbot logic: accumulate the llama.cpp token stream into one reply
def talk(prompt, history):
    if not model_ready:
        return "⏳ Please wait, the model is still loading..."
    try:
        response = ""
        response_stream = llm_llama_cpp.create_completion(
            prompt=prompt,
            max_tokens=200,
            stream=True,
        )
        for chunk in response_stream:
            if "choices" in chunk and "text" in chunk["choices"][0]:
                response += chunk["choices"][0]["text"]
        return response
    except Exception as e:
        print(f"Error in generating response: {e}")
        return f"Error during response generation: {e}"

# Gradio interface
demo = gr.ChatInterface(
    fn=talk,
    chatbot=gr.Chatbot(
        show_label=True,
        show_share_button=True,
        show_copy_button=True,
        layout="bubble",
        type="messages",
    ),
    theme="soft",  # built-in theme names are lowercase strings
    examples=["What is diabetes?"],  # ChatInterface expects a list of strings
    title=TITLE,
    description=DESCRIPTION,
)

# Launch the UI with a public share link
demo.launch(share=True)
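
# A minimal sketch of true incremental streaming, assuming the same
# globals as above. gr.ChatInterface also accepts generator functions
# that yield progressively longer strings, so the llama.cpp stream
# could be surfaced token by token instead of accumulated and returned
# at the end. The name `talk_streaming` is hypothetical, shown only
# for illustration; pass it as fn= in place of `talk`:
#
# def talk_streaming(prompt, history):
#     if not model_ready:
#         yield "⏳ Please wait, the model is still loading..."
#         return
#     response = ""
#     for chunk in llm_llama_cpp.create_completion(
#         prompt=prompt, max_tokens=200, stream=True
#     ):
#         response += chunk["choices"][0].get("text", "")
#         yield response  # Gradio re-renders the partial reply on each yield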