# GPT Gradio app by Craig Brennan
import torch
import tiktoken
import gradio as gr
from safetensors.torch import load_file
from huggingface_hub import hf_hub_download
import spaces

# Model hyperparameters
n_layers = 24
n_heads = 16
embed_dim = 1024
ffn_dim = embed_dim * 4
n_vocab = 50257
max_seq_len = 740
dropout = 0.0

# Number of tokens generated between streaming UI updates
update_interval = 14

@spaces.GPU
def load_model():
    from model import GPTModel

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = tiktoken.encoding_for_model('gpt2')

    # Download the pretrained weights from the Hugging Face Hub
    model_path = hf_hub_download('cbspace/gpt', 'model.safetensors')
    state_dict = load_file(model_path)

    model = GPTModel(device, n_layers, n_heads, embed_dim, ffn_dim, n_vocab, max_seq_len, dropout)
    model.load_state_dict(state_dict, strict=False)
    model.eval()
    return model, tokenizer, device

@spaces.GPU(duration=120)
def generate(prompt, out_tokens, top_k_value, temperature):
    model.to(device)
    outputs = tokenizer.encode(prompt)
    tokens_remaining = int(out_tokens)

    # Echo the prompt immediately, then stream the continuation in chunks
    out_text = prompt
    yield out_text

    while tokens_remaining:
        new_tokens = min(update_interval, tokens_remaining)
        outputs = model.generate(outputs, len(outputs) + new_tokens, temperature, top_k=int(top_k_value))
        tokens_remaining -= new_tokens
        out_text += tokenizer.decode(outputs[-new_tokens:])
        yield out_text

# Create the model
model, tokenizer, device = load_model()

app = gr.Interface(
    generate,
    [
        gr.Textbox(label='Prompt', lines=3),
        gr.Number(label='Output Tokens', value=180),
        gr.Slider(1, 100, step=5, value=60, label='Top-k Value'),
        gr.Slider(0.1, 2.0, step=0.05, value=0.9, label='Temperature')
    ],
    gr.Textbox(label='Output', lines=15, max_lines=15)
)
app.queue().launch(ssr_mode=False, share=True)
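
# ----------------------------------------------------------------------------
# For reference, a minimal sketch of the sampling loop that model.generate()
# is assumed to perform. GPTModel is defined in model.py, which is not part
# of this file, so the method body below is an illustration of standard
# temperature-scaled top-k sampling matching the call above, not the actual
# implementation.
# ----------------------------------------------------------------------------
# def generate(self, tokens, target_len, temperature, top_k):
#     """Extend a list of token ids until it reaches target_len ids."""
#     with torch.no_grad():
#         while len(tokens) < target_len:
#             # Forward pass over the current context, truncated to
#             # max_seq_len; assumes the model returns logits of shape
#             # [batch, seq, vocab]
#             context = torch.tensor(tokens[-max_seq_len:], device=self.device)
#             logits = self(context.unsqueeze(0))[0, -1] / temperature
#             # Keep the top-k logits and sample from the renormalised softmax
#             values, indices = torch.topk(logits, top_k)
#             probs = torch.softmax(values, dim=-1)
#             tokens.append(indices[torch.multinomial(probs, 1)].item())
#     return tokens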