import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datetime import datetime

print('{}: loading...'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
tokenizer = AutoTokenizer.from_pretrained('microsoft/Phi-3-mini-128k-instruct')
model = AutoModelForCausalLM.from_pretrained(
    'microsoft/Phi-3-mini-128k-instruct',
    torch_dtype='auto',
    trust_remote_code=True,
)
if torch.cuda.is_available():
    # Run in half precision on the GPU to reduce memory use.
    model.half()
    model = model.to('cuda')
generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=model.device)
print('{}: done.'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))


def generate(input_text, maxlen):
    # The pipeline applies the model's chat template to the messages list.
    messages = [{'role': 'user', 'content': input_text}]
    output = generator(
        messages,
        max_new_tokens=int(maxlen),  # slider values may arrive as floats
        return_full_text=False,
        do_sample=True,
        temperature=0.7,
    )
    return output[0]['generated_text']


with gr.Blocks(title='phi3 demo') as app:
    gr.Markdown('# Phi3 Demo')
    chatbot = gr.Chatbot(label='answer')
    msg = gr.Textbox(label='question')
    maxlen = gr.Slider(minimum=30, maximum=100, value=30, step=1, label='max length')
    clear = gr.ClearButton([msg, chatbot])

    def respond(message, maxlen, chat_history):
        # Ignore empty submissions.
        if message == '':
            return '', chat_history
        bot_message = generate(message, maxlen)
        chat_history.append((message, bot_message))
        # Clear the textbox and return the updated history to the Chatbot.
        return '', chat_history

    msg.submit(respond, [msg, maxlen, chatbot], [msg, chatbot], concurrency_limit=20)

app.launch()