import gradio as gr
from huggingface_hub import InferenceClient

client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

def respond(message, history):
    # Start with a system prompt, then replay the prior conversation
    messages = [{"role": "system", "content": "You are a friendly chatbot."}]
    if history:
        messages.extend(history)
    messages.append({"role": "user", "content": message})

    # Initialize the response as an empty string
    response = ""

    # Stream the response using yield and a for loop
    for chunk in client.chat_completion(
        messages,
        max_tokens=150,
        temperature=0.7,
        top_p=0.9,
        stream=True,  # Enable streaming
    ):
        # Capture the most recent token; the final chunk may carry no content
        token = chunk.choices[0].delta.content or ""

        # Add the token to the response
        response += token

        # Yield the accumulated response to stream it progressively
        yield response

chatbot = gr.ChatInterface(
    respond,
    type="messages",
    title="Streaming Chatbot",
    description="This chatbot streams responses as they are generated for a more dynamic experience!",
)

chatbot.launch()
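
# Usage note (assumptions about your environment, not part of the original
# script): running this file starts a local Gradio server, by default at
# http://127.0.0.1:7860. Depending on the model's availability on the
# serverless Inference API, you may need to authenticate first, e.g. via
# `huggingface-cli login` or by setting the HF_TOKEN environment variable,
# which InferenceClient picks up automatically. Passing share=True to
# launch() additionally creates a temporary public URL for the demo.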