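# Gradio ChatInterface demo for microsoft/Phi-3-medium-128k-instruct:
# loads the model in fp16 on the GPU and streams generated tokens to the UI.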
import gradio as gr
import spaces
import torch
from threading import Thread
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    TextIteratorStreamer,
)

model_name = "microsoft/Phi-3-medium-128k-instruct"

# Load the model in fp16 directly onto the GPU; trust_remote_code is required
# for Phi-3's custom modeling code.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
class StopOnTokens(StoppingCriteria):
    """Stop generation as soon as the last generated token is a stop id."""

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        stop_ids = [29, 0]  # token ids this demo treats as stop markers
        return input_ids[0][-1] in stop_ids
@spaces.GPU(duration=300)
def predict(message, history, temperature, max_tokens, top_p, top_k):
    # Append the new user message with an empty assistant slot.
    history_transformer_format = history + [[message, ""]]
    stop = StopOnTokens()

    # Build the Phi-3 chat-markup prompt by hand: each user turn is wrapped in
    # <|user|>...<|end|> and each completed assistant turn in
    # <|assistant|>...<|end|>; the trailing empty assistant slot leaves the
    # prompt ending in <|assistant|>\n so the model continues from there.
    messages = "".join(
        "<|user|>\n" + user_msg + "<|end|>\n<|assistant|>\n"
        + (assistant_msg + "<|end|>\n" if assistant_msg else "")
        for user_msg, assistant_msg in history_transformer_format
    )
    model_inputs = tokenizer([messages], return_tensors="pt").to("cuda")

    # Stream tokens back to the UI as they are generated.
    streamer = TextIteratorStreamer(tokenizer, timeout=300.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        stopping_criteria=StoppingCriteriaList([stop]),
    )

    # Run generation on a background thread so we can yield partial output.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_message = ""
    for new_token in streamer:
        if new_token != "<":  # drop stray tag fragments the streamer may emit
            partial_message += new_token
            yield partial_message
# Wire the streaming predict function into a chat UI with sampling controls.
demo = gr.ChatInterface(
    fn=predict,
    title="Phi-3-medium-128k-instruct",
    additional_inputs=[
        gr.Slider(0.1, 0.9, value=0.7, label="Temperature"),
        gr.Slider(512, 8192, value=4096, label="Max Tokens"),
        gr.Slider(0.1, 0.9, value=0.7, label="top_p"),
        gr.Slider(10, 90, value=40, label="top_k"),
    ],
)

demo.launch(share=True)