import gradio as gr
from modeling import global_config, ToyTransformer, AttentionBackend
import torch
from tokenizers import TRIETokenizer
from threading import Thread
import bisect
if torch.cuda.is_available():
    g_device = torch.device('cuda')
else:
    g_device = torch.device('cpu')
global_config['attn_backend'] = AttentionBackend.Naive
g_SEQ_LEN = 1024
g_HIDDEN_SIZE = 768
g_NUM_HEADS = 12
g_NUM_LAYERS = 12
g_DTYPE = torch.float32
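
# Shared tokenizer and model used by every request; the pruned vocabulary JSON and
# the model.pt checkpoint below are loaded once at import time.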
g_tokenizer = TRIETokenizer('llama_vocab_pruned_32k.json')
g_model = ToyTransformer(g_tokenizer.get_vocab_size(), g_NUM_LAYERS, g_NUM_HEADS, g_HIDDEN_SIZE, g_SEQ_LEN, g_device, g_DTYPE)
g_model.load_state_dict(torch.load('model.pt', map_location='cpu'))
def generate(model, tokenizer, prompt, temperature, top_p, rep_penalty,
             max_new_tokens=20, total_tokens=None,
             end_tokens=None,
             enable_kv_cache=True):
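    """Stream tokens from the model, one newly sampled token per step.

    Each iteration yields a list containing the latest token id. The logits are
    shaped by, in order: a repetition penalty over every token seen so far,
    temperature scaling, and top-p (nucleus) truncation. With enable_kv_cache=True
    only the most recent token is fed each step and cached keys/values are reused;
    otherwise the full sequence is re-encoded every step.
    """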
    model.eval()
    feed_tokens = tokenizer.encode(prompt) if isinstance(prompt, str) else prompt
    all_tokens = feed_tokens.copy()
    if total_tokens is not None:
        max_new_tokens = max(0, total_tokens - len(feed_tokens))
    with torch.no_grad():
        kv_cache = None
        for _ in range(max_new_tokens):
            logits, kv_cache = model.forward(
                torch.tensor([feed_tokens if enable_kv_cache else all_tokens]).to(model.device),
                kv_cache=kv_cache)
            logits = logits[0][-1].cpu()
            if not enable_kv_cache:
                kv_cache = None
            # apply repetition penalty
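            # (CTRL-style: negative logits are multiplied by rep_penalty, positive ones
            # divided by it, so for rep_penalty > 1 every token already seen becomes
            # less likely to be sampled again)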
            logits_rep = torch.gather(logits, 0, torch.tensor(all_tokens))
            logits_rep = torch.where(logits_rep < 0, logits_rep * rep_penalty, logits_rep / rep_penalty)
            logits.scatter_(0, torch.tensor(all_tokens), logits_rep)
            # apply temperature
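            # (the 1e-6 floor guards against division by zero at temperature == 0)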
            logits /= max(temperature, 1e-6)
            probs = torch.softmax(logits, dim=0)
            # apply top-p
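            # (keep the shortest prefix of the sorted distribution whose cumulative mass
            # reaches top_p: bisect_right finds the first cumulative sum above top_p and
            # the +1 keeps the token that crosses the threshold; torch.multinomial then
            # renormalizes the truncated weights)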
            ordered_probs, ordered_indices = torch.sort(probs, descending=True)
            cum_probs = torch.cumsum(ordered_probs, dim=0).tolist()
            top_p_index = bisect.bisect_right(cum_probs, top_p) + 1
            ordered_probs, ordered_indices = ordered_probs[:top_p_index], ordered_indices[:top_p_index]
            sampled_index = ordered_indices[torch.multinomial(ordered_probs, num_samples=1).item()].item()
            all_tokens.append(sampled_index)
            feed_tokens = [sampled_index]
            if end_tokens is not None and sampled_index in end_tokens:
                break
            yield feed_tokens
    return
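
# Minimal usage sketch for generate() (assumes the checkpoint and vocabulary loaded
# above; the parameter values here are illustrative, not tuned):
#
#   for piece in generate(g_model, g_tokenizer, 'A chat between User and Assistant.',
#                         temperature=0.8, top_p=0.9, rep_penalty=1.1, max_new_tokens=64):
#       print(g_tokenizer.decode(piece), end='', flush=True)
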
def predict(user_input, history, max_length, top_p, temperature, rep_penalty, retry):
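    """Gradio streaming handler: rebuild the prompt from the chat history and stream the reply.

    When retry is set, the last exchange is regenerated instead of appending a new turn.
    Older rounds are dropped (most recent kept first) so that the encoded prompt plus
    max_length new tokens stays within the g_SEQ_LEN context window.
    """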
    if retry and len(history) == 0:
        yield []
        return
    elif retry:
        user_input = history[-1][0]
        history = history[:-1]
    history.append((user_input, ""))
    encoded_inputs = [(g_tokenizer.encode('User:' + h[0]), g_tokenizer.encode('Assistant:' + h[1])) for h in history]
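    # Walk the history from the newest round backwards, keeping whole rounds until the
    # running token count would exceed the context budget (g_SEQ_LEN - max_length).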
    taken_rounds, taken_rounds_length = [], 0
    while len(taken_rounds) < len(encoded_inputs):
        round_pair = encoded_inputs[len(encoded_inputs) - 1 - len(taken_rounds)]
        if len(round_pair[0]) + len(round_pair[1]) + taken_rounds_length >= g_SEQ_LEN - max_length:
            break
        taken_rounds.append(round_pair)
        taken_rounds_length += len(round_pair[0]) + len(round_pair[1])
    taken_rounds = taken_rounds[::-1]
    input_tokens = g_tokenizer.encode('A chat between User and Assistant.')
    for round_pair in taken_rounds:
        input_tokens += g_tokenizer.encode('\n') + round_pair[0] + g_tokenizer.encode('\n') + round_pair[1]
    # print(taken_rounds, g_tokenizer.decode(input_tokens))
    for response in generate(g_model, g_tokenizer, input_tokens, temperature, top_p, rep_penalty, max_length, end_tokens=g_tokenizer.encode('')):
        history[-1] = (history[-1][0], history[-1][1] + g_tokenizer.decode(response))
        yield history
def main():
    css = '''
    .contain {max-width:50}
    #chatbot {min-height:500px}
    '''
    with gr.Blocks(css=css) as demo:
        gr.HTML('