# Phi-3-Medium / app.py
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    TextIteratorStreamer,
)

# Load the 4k-context model and its tokenizer. Keeping a dedicated tokenizer
# per model avoids the two checkpoints silently sharing one global tokenizer.
model_name1 = "microsoft/Phi-3-medium-4k-instruct"
model1 = AutoModelForCausalLM.from_pretrained(
    model_name1, device_map="cuda", torch_dtype=torch.float16, trust_remote_code=True
)
tokenizer1 = AutoTokenizer.from_pretrained(model_name1)
class StopOnTokens(StoppingCriteria):
    """Stop generation as soon as the last sampled token is one of `stop_ids`."""

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        stop_ids = [29, 0]  # token ids treated as stop signals in the original app
        for stop_id in stop_ids:
            if input_ids[0][-1] == stop_id:
                return True
        return False
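# For reference, a minimal sketch of how the criteria fires (the stop ids above
# are carried over from the original app and are assumptions about the Phi-3
# tokenizer's special tokens, not verified here):
#
#   crit = StopOnTokens()
#   ids = torch.tensor([[1, 29]])          # last sampled token is a stop id
#   assert crit(ids, scores=None) is True  # generate() would halt here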
@spaces.GPU(duration=20, queue=False)
def predict1(message, history, temperature, max_tokens, top_p, top_k):
    # Append the new user turn (with an empty assistant slot) to the history.
    history_transformer_format = history + [[message, ""]]
    stop = StopOnTokens()

    # Flatten the chat history into Phi-3's <|user|>/<|assistant|> prompt format.
    messages = "".join(
        "".join(["\n<|end|>\n<|user|>\n" + item[0], "\n<|end|>\n<|assistant|>\n" + item[1]])
        for item in history_transformer_format
    )
    model_inputs = tokenizer1([messages], return_tensors="pt").to("cuda")

    # Stream tokens back as they are generated rather than waiting for the full reply.
    streamer = TextIteratorStreamer(tokenizer1, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        stopping_criteria=StoppingCriteriaList([stop]),
    )
    t = Thread(target=model1.generate, kwargs=generate_kwargs)
    t.start()

    partial_message = ""
    for new_token in streamer:
        if new_token != "<":  # drop stray template markers leaking into the stream
            partial_message += new_token
            yield partial_message
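# Note: the prompt above is concatenated by hand, so the first turn also gets a
# leading "\n<|end|>" and there is no <|system|> block. A sketch of the same
# conversion using the tokenizer's built-in chat template instead (assuming the
# Phi-3 tokenizer ships one, which is not verified here):
#
#   chat = []
#   for user_msg, assistant_msg in history_transformer_format:
#       chat.append({"role": "user", "content": user_msg})
#       if assistant_msg:
#           chat.append({"role": "assistant", "content": assistant_msg})
#   input_ids = tokenizer1.apply_chat_template(
#       chat, add_generation_prompt=True, return_tensors="pt"
#   ).to("cuda")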
# Load the 128k-context model and its tokenizer.
model_name = "microsoft/Phi-3-medium-128k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="cuda", torch_dtype=torch.float16, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
@spaces.GPU(duration=40, queue=False)
def predict(message, history, temperature, max_tokens, top_p, top_k):
    # Same flow as predict1, but against the 128k-context model.
    history_transformer_format = history + [[message, ""]]
    stop = StopOnTokens()

    messages = "".join(
        "".join(["\n<|end|>\n<|user|>\n" + item[0], "\n<|end|>\n<|assistant|>\n" + item[1]])
        for item in history_transformer_format
    )
    model_inputs = tokenizer([messages], return_tensors="pt").to("cuda")

    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        stopping_criteria=StoppingCriteriaList([stop]),
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_message = ""
    for new_token in streamer:
        if new_token != "<":  # drop stray template markers leaking into the stream
            partial_message += new_token
            yield partial_message
# Tab for the 4k-context model (renamed from `min` to avoid shadowing the builtin).
with gr.Blocks() as demo_4k:
    gr.ChatInterface(
        fn=predict1,
        title="Phi-3-medium-4k-instruct",
        additional_inputs=[
            gr.Slider(0.1, 0.9, value=0.7, label="Temperature"),
            gr.Slider(512, 4096, value=4096, label="Max Tokens"),
            gr.Slider(0.1, 0.9, value=0.7, label="top_p"),
            gr.Slider(10, 90, value=40, label="top_k"),
        ],
    )
# Tab for the 128k-context model (renamed from `max` to avoid shadowing the builtin).
with gr.Blocks() as demo_128k:
    gr.ChatInterface(
        fn=predict,
        title="Phi-3-medium-128k-instruct",
        additional_inputs=[
            gr.Slider(0.1, 0.9, value=0.7, label="Temperature"),
            gr.Slider(64000, 128000, value=100000, label="Max Tokens"),
            gr.Slider(0.1, 0.9, value=0.7, label="top_p"),
            gr.Slider(10, 90, value=40, label="top_k"),
        ],
    )
with gr.Blocks(title="Phi 3 Medium DEMO") as demo:
    gr.Markdown("# Phi-3 Medium all in one")
    gr.TabbedInterface([demo_128k, demo_4k], ["Phi3 medium 128k", "Phi3 medium 4k"])

demo.launch(share=True)
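# A minimal sketch of querying this app programmatically with gradio_client
# (the Space id and parameter order are assumptions; the extra positional
# arguments mirror the additional_inputs sliders defined above):
#
#   from gradio_client import Client
#   client = Client("KingNish/Phi-3-Medium")  # hypothetical Space id
#   reply = client.predict(
#       "Hello!",          # message
#       0.7,               # temperature
#       4096,              # max tokens
#       0.7,               # top_p
#       40,                # top_k
#       api_name="/chat",  # assumed endpoint name for the ChatInterface
#   )
#   print(reply)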