Spaces:

unifyai
/

llmrouter

Runtime error

App Files Files Community

llmrouter / app.py

ivy-exploration

Update chat interface link

50951b2 verified 11 months ago

raw

history blame contribute delete

5.68 kB

	from openai import OpenAI
	import streamlit as st
	import numpy as np
	from PIL import Image
	from time import perf_counter
	import itertools

	# Page setup: tab title and icon, wide layout, sidebar collapsed on load.
	st.set_page_config(
	    page_title="Unify Router Demo",
	    page_icon="./assets/unify_spiral.png",
	    layout="wide",
	    initial_sidebar_state="collapsed",
	)

	# Load the spiral logo into a NumPy array; it is passed as the avatar
	# for every non-user chat message further down.
	spiral_image = Image.open('./assets/unify_spiral.png')
	router_avatar = np.array(spiral_image)

	# Custom font: read the local stylesheet and inject it as a raw <style> tag.
	with open("./style.css") as stylesheet:
	    css_rules = stylesheet.read()
	st.markdown(f'<style>{css_rules}</style>', unsafe_allow_html=True)

	# Banner telling visitors this is a preview and linking to the full Chat UI.
	preview_notice = "This demo is only a preview. Check out our [Chat UI](https://unify.ai/chat) for the full experience, including more endpoints, and extra customization!"
	st.info(body=preview_notice, icon="ℹ️")

	# Parameter choices.
	# Keys are the labels offered in the select boxes; values are the
	# identifiers joined as "<model>@<strategy>" when the request is sent.

	# Routing metric per user-facing label.
	strategies = {
	    "🏃 fastest": "tks-per-sec",
	    "⌛ most responsive": "ttft",
	    "💵 cheapest": "input-cost",
	}

	# Model identifier per user-facing label.
	models = {
	    "🦙 Llama2 70B Chat": "llama-2-70b-chat",
	    "💨 Mixtral 8x7B Instruct": "mixtral-8x7b-instruct-v0.1",
	    "💎 Gemma 7B": "gemma-7b-it",
	}

	# Body: a narrow parameters column beside a chat column three times as wide.
	Parameters_Col, Chat_Col = st.columns([1, 3])

	# NOTE(review): nesting was reconstructed from a flattened copy of this
	# file; both sliders are assumed to live inside the expander — confirm.
	with Parameters_Col:

	    st.image(
	        "./assets/unify_logo.png",
	        use_column_width="auto",
	    )
	    st.markdown("Send your prompts to the best LLM endpoint and optimize performance, all with a single API")

	    # Routing metric. The chosen label is mapped through `strategies` when
	    # the request is built, so the help text describes those exact metrics
	    # (tks-per-sec, ttft, input-cost).
	    strategy = st.selectbox(
	        label='I want the',
	        options=tuple(strategies),
	        help=(
	            "Choose the metric to optimize the routing for. "
	            "Fastest picks the endpoint with the highest output tokens per second. "
	            "Most responsive picks the endpoint with the smallest time to first token. "
	            "Cheapest picks the endpoint with the lowest input tokens cost"
	        ),
	    )

	    # Target model. The chosen label is mapped through `models`.
	    model = st.selectbox(
	        label='endpoint for',
	        options=tuple(models),
	        help="Select a model to optimize for. The same model can be offered by different model endpoint providers. The router lets you find the optimal endpoint for your chosen model, target metric, and input prompt",
	    )

	    # Generation settings forwarded unchanged to the completion request.
	    with st.expander("Advanced Inputs"):
	        max_tokens = st.slider(
	            label="Maximum Number Of Tokens",
	            min_value=100,
	            max_value=2000,
	            value=500,
	            step=100,
	            help="The maximum number of tokens that can be generated.",
	        )
	        temperature = st.slider(
	            label="Temperature",
	            min_value=0.0,
	            max_value=1.,
	            value=0.5,
	            step=0.5,
	            help="The model's output randomness. Higher values give more random outputs.",
	        )

	# NOTE(review): nesting was reconstructed from a flattened copy of this
	# file; in particular the "Clear Chat" button is assumed to sit inside the
	# chat column — confirm against the repository.
	with Chat_Col:

	    # Initializing empty chat space and messages state
	    if "messages" not in st.session_state:
	        st.session_state.messages = []
	    msgs = st.container(height=350)

	    # Writing conversation history; non-user messages get the router avatar
	    for msg in st.session_state.messages:
	        if msg["role"] == "user":
	            msgs.chat_message(msg["role"]).write(msg["content"])
	        else:
	            msgs.chat_message(msg["role"], avatar=router_avatar).write(msg["content"])

	    # Preparing client (OpenAI SDK pointed at the Unify endpoint)
	    client = OpenAI(
	        base_url="https://api.unify.ai/v0/",
	        api_key=st.secrets["UNIFY_API"],
	    )

	    # Processing prompt box input
	    if prompt := st.chat_input("Enter your prompt.."):

	        # Displaying user prompt and saving in message states
	        st.session_state.messages.append({"role": "user", "content": prompt})
	        with msgs.chat_message("user"):
	            st.write(prompt)

	        # Displaying output and metrics
	        with msgs.status("Routing your prompt..", expanded=True):
	            # Sending prompt to model endpoint: "<model>@<strategy>"
	            start = perf_counter()
	            stream = client.chat.completions.create(
	                model="@".join([models[model], strategies[strategy]]),
	                messages=[
	                    {"role": m["role"], "content": m["content"]}
	                    for m in st.session_state.messages
	                ],
	                stream=True,
	                max_tokens=max_tokens,
	                temperature=temperature,
	            )

	            # Writing answer progressively; the tee'd copy keeps every raw
	            # chunk so usage and model metadata can be read afterwards.
	            stream, stream_copy = itertools.tee(stream)
	            st.write_stream(stream)
	            chunks = list(stream_copy)

	            # The clock is stopped only after the stream is exhausted: with
	            # stream=True, create() returns before the tokens have arrived,
	            # so timing it alone would under-report the completion time and
	            # inflate the tokens-per-second figure.
	            elapsed = perf_counter() - start
	            time_to_completion = round(elapsed, 2)

	            if chunks:
	                # Computing metrics from the usage attached to the last chunk
	                last_chunk = chunks[-1]
	                cost = round(last_chunk.usage["cost"], 6)
	                output_tokens = last_chunk.usage["completion_tokens"]
	                # Divide by the unrounded time; guard the degenerate zero case.
	                tokens_per_second = round(output_tokens / elapsed, 2) if elapsed > 0 else 0.0

	                # Displaying model, provider, and metrics.
	                # The provider is the part after the last "@" in the
	                # model name reported by the first chunk.
	                provider = " ".join(chunks[0].model.split("@")[-1].split("-")).title()
	                if " Ai" in provider:
	                    provider = provider.replace("Ai", "AI")
	                st.markdown(f"Model: {model}. Provider: {provider}")
	                st.markdown(
	                    f"{tokens_per_second} Tokens Per Second - "
	                    f"{time_to_completion} Seconds to complete - "
	                    f"{cost:.6f} $"
	                )
	            else:
	                # Nothing streamed back: no usage data to report.
	                st.warning("The endpoint returned an empty response.")

	        # Saving output to message states. Chunks without choices carry
	        # no text, so they are skipped rather than indexed.
	        if chunks:
	            response = "".join(
	                chunk.choices[0].delta.content or ""
	                for chunk in chunks
	                if chunk.choices
	            )
	            st.session_state.messages.append({"role": "assistant", "content": response})

	    # Clear Chat button. The history above has already been drawn for this
	    # run, so reset the stored messages and rerun the script to redraw an
	    # empty conversation immediately.
	    if st.button("Clear Chat", key="clear"):
	        st.session_state.messages = []
	        st.rerun()