import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datetime import datetime

print('{}: loading...'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
tokenizer = AutoTokenizer.from_pretrained('microsoft/Phi-3-mini-128k-instruct')
model = AutoModelForCausalLM.from_pretrained(
    'microsoft/Phi-3-mini-128k-instruct',
    torch_dtype='auto',
    trust_remote_code=True,
)
if torch.cuda.is_available():
    # Run in half precision on the GPU to reduce memory use.
    model.half()
    model = model.to('cuda')
generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=model.device)
print('{}: done.'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))


def generate(input_text, maxlen):
    # The pipeline applies the model's chat template to the messages list.
    messages = [{'role': 'user', 'content': input_text}]
    output = generator(
        messages,
        max_new_tokens=int(maxlen),  # slider values may arrive as floats
        return_full_text=False,
        do_sample=True,
        temperature=0.7,
    )
    return output[0]['generated_text']


with gr.Blocks(title='phi3 demo') as app:
    gr.Markdown('# Phi3 Demo')
    chatbot = gr.Chatbot(label='answer')
    msg = gr.Textbox(label='question')
    maxlen = gr.Slider(minimum=30, maximum=100, value=30, step=1, label='max length')
    clear = gr.ClearButton([msg, chatbot])

    def respond(message, maxlen, chat_history):
        # Ignore empty submissions.
        if message == '':
            return '', chat_history
        bot_message = generate(message, maxlen)
        chat_history.append((message, bot_message))
        # Clear the textbox and return the updated history to the Chatbot.
        return '', chat_history

    msg.submit(respond, [msg, maxlen, chatbot], [msg, chatbot], concurrency_limit=20)

app.launch()