import time
import torch
import gradio as gr
import torch._dynamo as dynamo


# Load the serialized model and tokenizer objects from files in the current
# working directory.
# NOTE(review): torch.load unpickles its input, so these files must come from
# a trusted source.
model = torch.load("GPT2Model.pt")
tokenizer = torch.load("GPT2Tokenizer.pt")
# Wrap the same eager model with two TorchDynamo backends; gpt2() below times
# each wrapper against the unwrapped model.
inductor_model = dynamo.optimize("inductor")(model)
tvm_model = dynamo.optimize("tvm")(model)

def timed(fn):
    """Call ``fn`` with no arguments and measure how long the call takes.

    Args:
        fn: Zero-argument callable to execute.

    Returns:
        A ``(result, seconds)`` tuple: whatever ``fn`` returned, and the
        elapsed time in seconds as a float rounded to 5 decimal places.
    """
    # perf_counter() is monotonic and high resolution, so the interval cannot
    # be skewed (or turn negative) by system clock adjustments the way a
    # difference of two time.time() readings can.
    start = time.perf_counter()
    result = fn()
    elapsed = time.perf_counter() - start
    return result, round(elapsed, 5)


def _format_speedup(baseline_time, measured_time):
    """Return ``baseline_time / measured_time`` as text such as ``"1.25x"``.

    Returns ``"n/a"`` when ``measured_time`` is zero (timings are rounded, so
    a very fast run can report 0.0) instead of raising ZeroDivisionError.
    """
    if measured_time == 0:
        return "n/a"
    # Fixed-point with two decimals. The previous "{:.2}" spec meant "two
    # significant digits" and rendered e.g. 12.5 as "1.2e+01".
    return "{:.2f}x".format(baseline_time / measured_time)


def gpt2(prompt):
    """Generate text for ``prompt`` with eager, inductor and TVM models.

    The prompt is tokenized once, then each of the three module-level models
    runs greedy generation (``do_sample=False``, ``max_length=30``) on the
    same input ids while being timed.

    Args:
        prompt: Text to tokenize and feed to the models.

    Returns:
        A multi-line string with the time taken by each variant, the speedup
        of the two compiled variants relative to eager, and either the
        decoded eager output or a mismatch message when the generated token
        ids differ between variants.
    """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    def run(candidate):
        # Same generation settings for every variant so outputs are comparable.
        return timed(
            lambda: candidate.generate(input_ids, do_sample=False, max_length=30)
        )

    # NOTE(review): the first call through a compiled wrapper presumably
    # includes compilation time, which would inflate its measurement - confirm.
    eager_outputs, eager_time = run(model)
    inductor_outputs, inductor_time = run(inductor_model)
    tvm_outputs, tvm_time = run(tvm_model)

    # torch.equal returns False when the sizes differ, so a variant that
    # produces a different number of tokens is reported as a mismatch.
    # NOTE(review): assumes generate() returns plain token-id tensors - confirm.
    outputs_match = torch.equal(eager_outputs, inductor_outputs) and torch.equal(
        eager_outputs, tvm_outputs
    )
    if outputs_match:
        actual_output = tokenizer.batch_decode(eager_outputs, skip_special_tokens=True)[0]
    else:
        actual_output = "Result is not correct between dynamo and eager!"

    inductor_speedup = _format_speedup(eager_time, inductor_time)
    tvm_speedup = _format_speedup(eager_time, tvm_time)
    report_lines = [
        f"Torch eager takes: {eager_time} sec",
        f"Inductor takes: {inductor_time} sec with {inductor_speedup} speedup",
        f"TVM takes: {tvm_time} sec with {tvm_speedup} speedup",
        f"Output: {actual_output}",
    ]
    return "\n".join(report_lines)

# Build a Gradio interface around gpt2() with a text input and a text output.
demo = gr.Interface(fn=gpt2, inputs="text", outputs="text")

# Start the Gradio app.
# NOTE(review): there is no __main__ guard, so this also runs whenever the
# module is imported.
demo.launch()