# Hugging Face Space status: Running on CPU Upgrade
import gradio as gr | |
from huggingface_hub import InferenceClient | |
import os | |
# Credentials and endpoint configuration come from environment variables;
# either value is None when the variable is not set.
token = os.environ.get("TOKEN")
# NOTE(review): `endpoint` is read here but not used by the client below —
# confirm whether the client was meant to target it.
endpoint = os.environ.get("ENDPOINT")

# Inference client bound to the hosted Meta-Llama-3.1-8B-Instruct model URL.
client = InferenceClient(
    model="https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B-Instruct",
    token=token,
)
def inference(message, history):
    """Stream a reply to ``message``, yielding the text accumulated so far.

    The module-level ``client`` is queried in streaming mode with a cap of
    100 new tokens. After every streamed chunk the full text received up to
    that point is yielded, so a consumer can re-render the growing reply.

    Args:
        message: Prompt text forwarded unchanged to ``client.text_generation``.
        history: Accepted as the second positional argument but not used;
            only ``message`` is sent to the model.

    Yields:
        The concatenation of all chunks received so far.
    """
    stream = client.text_generation(message, max_new_tokens=100, stream=True)
    reply_so_far = ""
    for chunk in stream:
        reply_so_far = reply_so_far + chunk
        yield reply_so_far
# Build the chat UI around the streaming `inference` generator, enable
# request queuing, and start serving.
gr.ChatInterface(
    inference,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Chat with me!", container=False, scale=7),
    title="Gradio 🤝 TGI",
    # The description names the model the client is actually configured with
    # (Meta-Llama-3.1-8B-Instruct); it previously said "LLaMA 7B-Chat".
    description="This is the demo for Gradio UI consuming TGI endpoint with Llama 3.1 8B Instruct model.",
    theme="abidlabs/Lime",
    examples=["Are tomatoes vegetables?"],
    # NOTE(review): caching examples presumably runs `inference` on the
    # example prompt at startup, which needs a reachable model — confirm
    # against the Gradio version in use.
    cache_examples=True,
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
).queue().launch()