import os

import gradio as gr
from huggingface_hub import InferenceClient

token = os.getenv("TOKEN")
endpoint = os.getenv("ENDPOINT")

# Initialize the InferenceClient against the TGI endpoint
# (falls back to the hosted Inference API for Llama 2 if ENDPOINT is unset).
client = InferenceClient(
    model=endpoint or "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf",
    token=token,
)

# Query the client in streaming mode, yielding the partial response
# so the Gradio UI updates as each new token arrives.
def inference(message, history):
    partial_message = ""
    for new_token in client.text_generation(message, max_new_tokens=20, stream=True):
        partial_message += new_token
        yield partial_message

gr.ChatInterface(
    inference,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Chat with me!", container=False, scale=7),
    title="Gradio 🤝 TGI",
    description="This is the demo for a Gradio UI consuming a TGI endpoint with a Llama 2 model.",
    theme="abidlabs/Lime",
    examples=["Are tomatoes vegetables?"],
    cache_examples=True,
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
).queue().launch()
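# A minimal way to run this demo, assuming the script above is saved as app.py
# (the filename and the token placeholder are assumptions, not from the source):
# set TOKEN to a Hugging Face access token with access to the model, and
# optionally ENDPOINT to a deployed TGI instance; otherwise the client falls
# back to the hosted Inference API URL above.
#
#   TOKEN=hf_xxx ENDPOINT=https://<your-tgi-endpoint> python app.py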