import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import threading

# Title and description
TITLE = "AI Copilot for Patients"
DESCRIPTION = "I provide answers to health-related concerns."

# Globals shared between the loader thread and the chat handler
llm_llama_cpp = None
model_ready = False

# Download and initialize the model in the background so the UI
# can come up immediately instead of blocking on the download.
def load_model():
    global llm_llama_cpp, model_ready
    try:
        print("Downloading model...")
        model_file_path = hf_hub_download(
            repo_id="TheBloke/Llama-2-7B-GGUF",
            filename="llama-2-7b.Q4_0.gguf",
        )
        print("Initializing model...")
        llm_llama_cpp = Llama(
            model_path=model_file_path,
            verbose=False,
            n_ctx=4096,
        )
        model_ready = True
        print("Model is ready.")
    except Exception as e:
        print(f"Failed to load model: {e}")

# Daemon thread so an in-flight download doesn't block interpreter exit
threading.Thread(target=load_model, daemon=True).start()

# Chatbot logic: accumulate the llama.cpp token stream into one reply
def talk(prompt, history):
    if not model_ready:
        return "⏳ Please wait, the model is still loading..."
    try:
        response = ""
        response_stream = llm_llama_cpp.create_completion(
            prompt=prompt,
            max_tokens=200,
            stream=True,
        )
        for chunk in response_stream:
            if "choices" in chunk and "text" in chunk["choices"][0]:
                response += chunk["choices"][0]["text"]
        return response
    except Exception as e:
        print(f"Error in generating response: {e}")
        return f"Error during response generation: {e}"

# Gradio interface
demo = gr.ChatInterface(
    fn=talk,
    chatbot=gr.Chatbot(
        show_label=True,
        show_share_button=True,
        show_copy_button=True,
        layout="bubble",
        type="messages",
    ),
    theme="soft",  # built-in theme names are lowercase strings
    examples=["What is diabetes?"],  # ChatInterface expects a list of strings
    title=TITLE,
    description=DESCRIPTION,
)

# Launch the UI with a public share link
demo.launch(share=True)
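
# A minimal sketch of true incremental streaming, assuming the same
# globals as above. gr.ChatInterface also accepts generator functions
# that yield progressively longer strings, so the llama.cpp stream
# could be surfaced token by token instead of accumulated and returned
# at the end. The name `talk_streaming` is hypothetical, shown only
# for illustration; pass it as fn= in place of `talk`:
#
# def talk_streaming(prompt, history):
#     if not model_ready:
#         yield "⏳ Please wait, the model is still loading..."
#         return
#     response = ""
#     for chunk in llm_llama_cpp.create_completion(
#         prompt=prompt, max_tokens=200, stream=True
#     ):
#         response += chunk["choices"][0].get("text", "")
#         yield response  # Gradio re-renders the partial reply on each yield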