import torch
import tiktoken
import gradio as gr
from safetensors.torch import load_file
from huggingface_hub import hf_hub_download
import spaces
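
# Model hyperparameters, assumed to match the cbspace/gpt checkpoint on the Hugging Face Hub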
n_layers = 24
n_heads = 16
embed_dim = 1024
ffn_dim = embed_dim * 4
n_vocab = 50257
max_seq_len = 740
dropout = 0.0
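
# Number of new tokens to generate between streamed output updates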
update_interval = 14

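
# Build the GPT model and load the pretrained weights from the Hub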
@spaces.GPU
def load_model():
    from model import GPTModel

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = tiktoken.encoding_for_model('gpt2')

    # Fetch the safetensors checkpoint from the Hugging Face Hub
    model_path = hf_hub_download('cbspace/gpt', 'model.safetensors')
    state_dict = load_file(model_path)

    model = GPTModel(device, n_layers, n_heads, embed_dim, ffn_dim, n_vocab, max_seq_len, dropout)
    model.load_state_dict(state_dict, strict=False)
    model.eval()
    return model, tokenizer, device

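
# Stream text for a prompt, yielding the partial output as new tokens are generated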
@spaces.GPU(duration=120)
def generate(prompt, out_tokens, top_k_value, temperature):
    model.to(device)
    outputs = tokenizer.encode(prompt)
    tokens_remaining = int(out_tokens)
    out_text = prompt
    yield out_text

    # Generate up to update_interval tokens per step and stream each chunk to the UI
    while tokens_remaining:
        new_inputs_len = min(update_interval, tokens_remaining)
        outputs = model.generate(outputs, len(outputs) + new_inputs_len, temperature, top_k=int(top_k_value))
        tokens_remaining -= new_inputs_len
        out_text += tokenizer.decode(outputs[-new_inputs_len:])
        yield out_text

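
# Load the model, tokenizer and device once at startup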
model, tokenizer, device = load_model()
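
# Gradio UI: prompt and sampling controls in, streamed text out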
app = gr.Interface(
    generate,
    [
        gr.Textbox(label='Prompt', lines=3),
        gr.Number(label='Output Tokens', value=180),
        gr.Slider(1, 100, step=5, value=60, label='Top-k Value'),
        gr.Slider(0.1, 2.0, step=0.05, value=0.9, label='Temperature')
    ],
    gr.Textbox(label='Output', lines=15, max_lines=15)
)

app.queue().launch(ssr_mode=False, share=True)