ToyTransformer / webui.py
larryvrh's picture
Initial commit
2a566c9
raw
history blame
5.69 kB
import gradio as gr
from modeling import global_config, ToyTransformer, AttentionBackend
import torch
from tokenizers import TRIETokenizer
from threading import Thread
import bisect
if torch.cuda.is_available():
g_device = torch.device('cpu')
else:
g_device = torch.device('cpu')
global_config['attn_backend'] = AttentionBackend.Naive
g_SEQ_LEN = 1024
g_HIDDEN_SIZE = 768
g_NUM_HEADS = 12
g_NUM_LAYERS = 12
g_DTYPE = torch.float32
g_tokenizer = TRIETokenizer('llama_vocab_pruned_32k.json')
g_model = ToyTransformer(g_tokenizer.get_vocab_size(), g_NUM_LAYERS, g_NUM_HEADS, g_HIDDEN_SIZE, g_SEQ_LEN, g_device, g_DTYPE)
g_model.load_state_dict(torch.load('model.pt', map_location='cpu'))
def generate(model, tokenizer, prompt, temperature, top_p, rep_penalty,
max_new_tokens=20, total_tokens=None,
end_tokens=None,
enable_kv_cache=True):
model.eval()
feed_tokens = tokenizer.encode(prompt) if isinstance(prompt, str) else prompt
all_tokens = feed_tokens.copy()
if total_tokens is not None:
max_new_tokens = max(0, total_tokens - len(feed_tokens))
with torch.no_grad():
kv_cache = None
for _ in range(max_new_tokens):
logits, kv_cache = model.forward(
torch.tensor([feed_tokens if enable_kv_cache else all_tokens]).to(model.device),
kv_cache=kv_cache)
logits = logits[0][-1].cpu()
if not enable_kv_cache:
kv_cache = None
# apply repetition penalty
logits_rep = torch.gather(logits, 0, torch.tensor(all_tokens))
logits_rep = torch.where(logits_rep < 0, logits_rep * rep_penalty, logits_rep / rep_penalty)
logits.scatter_(0, torch.tensor(all_tokens), logits_rep)
# apply temperature
logits /= max(temperature, 1e-6)
probs = torch.softmax(logits, dim=0)
# apply top-p
ordered_probs, ordered_indices = torch.sort(probs, descending=True)
cum_probs = torch.cumsum(ordered_probs, dim=0).tolist()
top_p_index = bisect.bisect_right(cum_probs, top_p) + 1
ordered_probs, ordered_indices = ordered_probs[:top_p_index], ordered_indices[:top_p_index]
sampled_index = ordered_indices[torch.multinomial(ordered_probs, num_samples=1).item()].item()
all_tokens.append(sampled_index)
feed_tokens = [sampled_index]
if end_tokens is not None and sampled_index in end_tokens:
break
yield feed_tokens
return
def predict(user_input, history, max_length, top_p, temperature, rep_penalty, retry):
if retry and len(history) == 0:
yield []
return
elif retry:
user_input = history[-1][0]
history = history[:-1]
history.append((user_input, ""))
encoded_inputs = [(g_tokenizer.encode('User:' + h[0]), g_tokenizer.encode('Assistant:' + h[1])) for h in history]
taken_rounds, taken_rounds_length = [], 0
while len(taken_rounds) < len(encoded_inputs):
round_pair = encoded_inputs[len(encoded_inputs) - 1 - len(taken_rounds)]
if len(round_pair[0]) + len(round_pair[1]) + taken_rounds_length >= g_SEQ_LEN - max_length:
break
taken_rounds.append(round_pair)
taken_rounds_length += len(round_pair[0]) + len(round_pair[1])
taken_rounds = taken_rounds[::-1]
input_tokens = g_tokenizer.encode('<s>A chat between User and Assistant.')
for round_pair in taken_rounds:
input_tokens += g_tokenizer.encode('\n') + round_pair[0] + g_tokenizer.encode('\n') + round_pair[1]
# print(taken_rounds, g_tokenizer.decode(input_tokens))
for response in generate(g_model, g_tokenizer, input_tokens, temperature, top_p, rep_penalty, max_length, end_tokens=g_tokenizer.encode('</s>')):
history[-1] = (history[-1][0], history[-1][1] + g_tokenizer.decode(response))
yield history
def main():
css = '''
.contain {max-width:50}
#chatbot {min-height:500px}
'''
with gr.Blocks(css=css) as demo:
gr.HTML('<h1 align="center">ToyTransformer</h1>')
chatbot = gr.Chatbot(elem_id='chatbot')
with gr.Column():
user_input = gr.Textbox(show_label=False, placeholder="输入", lines=1, container=False)
with gr.Row():
submitBtn = gr.Button("Send", variant="primary")
retryBtn = gr.Button("Retry")
cancelBtn = gr.Button('Undo')
emptyBtn = gr.Button("Clear")
with gr.Row():
max_length = gr.Slider(0, 512, value=200, step=1, label="Max Response Tokens", interactive=True)
top_p = gr.Slider(0, 1, value=0.7, step=0.01, label="Top-P", interactive=True)
temperature = gr.Slider(0, 1, value=0.5, step=0.01, label="Temperature", interactive=True)
rep_penalty = gr.Slider(1.0, 1.5, value=1.1, step=0.01, label='Repetition Penalty', interactive=True)
submitBtn.click(predict, [user_input, chatbot, max_length, top_p, temperature, rep_penalty, gr.State(False)],
[chatbot], show_progress=False)
submitBtn.click(lambda: '', [], [user_input], show_progress=False)
retryBtn.click(predict, [user_input, chatbot, max_length, top_p, temperature, rep_penalty, gr.State(True)],
[chatbot], show_progress=False)
cancelBtn.click(lambda m: m[:-1], [chatbot], [chatbot], show_progress=False)
emptyBtn.click(lambda: [], outputs=[chatbot], show_progress=False)
demo.queue().launch(share=False, inbrowser=True)
main()