|
import gradio as gr |
|
from openai import AsyncOpenAI |
|
|
|
# Address of the OpenAI-compatible endpoint this app sends chat requests to.
base_url = "http://127.0.0.1:8080/v1"

# Single async client shared by every request; "-" is passed as the API key
# value (presumably a placeholder the local server ignores -- TODO confirm).
client = AsyncOpenAI(api_key="-", base_url=base_url)
|
|
|
""" |
|
Reference list of chat-completion request parameters (name: type = default).
_default_parameters() below sets a subset of them.

frequency_penalty: Optional[float] = None,
logit_bias: Optional[List[float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
stream: bool = False,
seed: Optional[int] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
tools: Optional[List[Tool]] = None,
tool_choice: Optional[str] = None,
|
""" |
|
|
|
def _default_parameters(): |
|
return { |
|
"max_tokens": 256, |
|
"stream": True, |
|
"temperature": 0.9, |
|
} |
|
|
|
def _translate_messages(history): |
|
messages = [] |
|
|
|
for conv in history: |
|
messages.append({"role":"user", "content":conv[0]}) |
|
messages.append({"role":"assistant", "content":conv[1]}) |
|
|
|
return messages |
|
|
|
async def echo(message, history):
    """Stream the model's reply to ``message`` as a growing string.

    Builds the request from the prior ``history`` plus the new user
    ``message``, sends it through the module-level ``client`` with the
    settings from ``_default_parameters()`` (which enable streaming), and
    yields the accumulated reply text after each chunk that carries text.

    Args:
        message: The new user message text.
        history: Prior conversation, as accepted by ``_translate_messages``.

    Yields:
        The reply text received so far (each value extends the previous one).
    """
    parameters = _default_parameters()
    messages = _translate_messages(history)
    messages.append({"role": "user", "content": message})

    responses = await client.chat.completions.create(
        model="tgi", messages=messages, **parameters
    )

    full_resp = ""
    async for resp in responses:
        # A streamed chunk may have no choices, or a delta whose content is
        # None (e.g. a role-only or final chunk). Concatenating None to a str
        # would raise TypeError, so such chunks are skipped.
        if not resp.choices:
            continue
        piece = resp.choices[0].delta.content
        if piece is None:
            continue
        full_resp += piece
        yield full_resp
|
|
|
# Text-only chat UI whose replies come from the streaming `echo` handler.
demo = gr.ChatInterface(
    title="Space of Gradio ➕ Text Generation Inference",
    fn=echo,
    multimodal=False,
    examples=["hello", "how are you?", "What is Large Language Model?"],
)

# Enable the request queue, then serve on all interfaces at port 3000.
demo.queue()
demo.launch(server_port=3000, server_name="0.0.0.0")