Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 993 Bytes
dea517c 7898ec7 dea517c b451ff3 dea517c a1dd72a dea517c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
import gradio as gr
from huggingface_hub import InferenceClient
import os
# Credentials and endpoint configuration come from the environment
# (set as Space secrets in the HF Spaces UI).
token = os.getenv("TOKEN")
endpoint = os.getenv("ENDPOINT")

# Initialize the InferenceClient.
# FIX: ENDPOINT was previously read but never used (dead config) — honor it
# when set, falling back to the original hard-coded serverless Inference API
# URL so behavior is unchanged when ENDPOINT is absent.
client = InferenceClient(
    model=endpoint
    or "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B-Instruct",
    token=token,
)
# query client using streaming mode
# query client using streaming mode
def inference(message, history):
    """Stream a text-generation response for *message*.

    Parameters
    ----------
    message : str
        The user's latest chat message.
    history : list
        Prior chat turns supplied by gr.ChatInterface (unused here —
        each request is sent without conversation context).

    Yields
    ------
    str
        The accumulated response text after each streamed chunk, so the
        Gradio chat UI updates incrementally.
    """
    partial_message = ""
    # FIX: the loop variable was named `token`, shadowing the module-level
    # auth token read from the environment — renamed to `chunk`.
    for chunk in client.text_generation(message, max_new_tokens=100, stream=True):
        partial_message += chunk
        yield partial_message
# Wire the streaming inference function into a chat UI and launch it.
# FIX: removed the stray trailing "|" (copy/paste artifact) that made the
# original line a syntax error.
gr.ChatInterface(
    inference,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Chat with me!", container=False, scale=7),
    title="Gradio 🤝 TGI",
    # FIX: description said "LLaMA 7B-Chat", but the configured model is
    # Meta-Llama-3.1-8B-Instruct — corrected the user-facing text.
    description="This is the demo for Gradio UI consuming TGI endpoint with Llama 3.1 8B Instruct model.",
    theme="abidlabs/Lime",
    examples=["Are tomatoes vegetables?"],
    cache_examples=True,
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
).queue().launch()