Spaces:

merve
/

gradio-tgi-2

Running on CPU Upgrade

gradio-tgi-2 / app.py

reach-vb HF staff

Update app.py

63b4548 verified 3 months ago

993 Bytes

	import gradio as gr
	from huggingface_hub import InferenceClient
	import os

	# Credentials and settings are taken from the process environment; each
	# value is None when the corresponding variable is not set.
	token = os.environ.get("TOKEN")
	# NOTE(review): `endpoint` is read here but nothing visible in this file
	# uses it -- the client below is pointed at a fixed model URL instead.
	endpoint = os.environ.get("ENDPOINT")

	# URL of the hosted model that every chat request is sent to.
	MODEL_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B-Instruct"

	# initialize InferenceClient (shared by all requests handled by this app)
	client = InferenceClient(model=MODEL_URL, token=token)

	# query client using streaming mode
	def inference(message, history):
	    """Stream the model's reply to ``message`` as a growing string.

	    The message is sent to the module-level ``client`` with streaming
	    enabled and a cap of 100 new tokens. After every streamed piece the
	    generator yields the full text received so far, so each yielded value
	    extends the previous one.

	    ``history`` is accepted but not used: only ``message`` is sent, so
	    earlier turns of the conversation are not part of the request.
	    """
	    stream = client.text_generation(message, max_new_tokens=100, stream=True)
	    reply_so_far = ""
	    for piece in stream:
	        reply_so_far = reply_so_far + piece
	        yield reply_so_far

	# Build the chat UI around `inference` and start serving it.
	#
	# The description names the model the client is actually configured for
	# (Meta-Llama-3.1-8B-Instruct); it previously advertised "LLaMA 7B-Chat".
	#
	# NOTE(review): retry_btn / undo_btn / clear_btn are ChatInterface arguments
	# in Gradio 4.x; they are reported as removed in Gradio 5 -- confirm against
	# the Gradio version this Space pins before upgrading.
	# NOTE(review): cache_examples=True presumably runs `inference` on the
	# example prompt ahead of user interaction, which needs a working token and
	# reachable model -- confirm this is intended.
	demo = gr.ChatInterface(
	    inference,
	    chatbot=gr.Chatbot(height=300),
	    textbox=gr.Textbox(placeholder="Chat with me!", container=False, scale=7),
	    title="Gradio 🤝 TGI",
	    description="This is the demo for Gradio UI consuming TGI endpoint with Llama 3.1 8B Instruct model.",
	    theme="abidlabs/Lime",
	    examples=["Are tomatoes vegetables?"],
	    cache_examples=True,
	    retry_btn="Retry",
	    undo_btn="Undo",
	    clear_btn="Clear",
	)

	# Enable request queuing (needed for streamed/generator responses), then serve.
	demo.queue().launch()