import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

# Hugging Face Hub id of the checkpoint; the tokenizer and the weights are
# both loaded from it.
_MODEL_ID = "microsoft/phi-2"

# NOTE: trust_remote_code=True allows Python code shipped in the checkpoint
# repository to be executed locally while loading.
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID, trust_remote_code=True)

# Pick dtype and placement together: float16 on the GPU when CUDA is
# available, float32 on the CPU otherwise.
if torch.cuda.is_available():
    _DTYPE, _DEVICE_MAP = torch.float16, "cuda"
else:
    _DTYPE, _DEVICE_MAP = torch.float32, "cpu"

model = AutoModelForCausalLM.from_pretrained(
    _MODEL_ID,
    torch_dtype=_DTYPE,
    device_map=_DEVICE_MAP,
    trust_remote_code=True,
)


def generate(prompt, length):
    """Generate a continuation of ``prompt`` with the module-level model.

    Args:
        prompt: Text to continue.
        length: Upper bound on the total sequence length in tokens (prompt
            tokens plus generated tokens). Any value accepted by ``int()``
            works; fractional values are truncated.

    Returns:
        The decoded first (and only) sequence produced by the model. If the
        prompt already contains ``length`` tokens or more, no generation is
        attempted and ``prompt`` is returned unchanged.
    """
    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)

    # Number of tokens in the prompt. Note that len(inputs) would count the
    # entries of the encoding (only "input_ids" here), not the tokens.
    prompt_tokens = inputs["input_ids"].shape[-1]

    # The UI number field may deliver a float; max_length must be an integer.
    max_length = int(length)
    if max_length <= prompt_tokens:
        # The limit leaves no room for new tokens. Depending on the
        # transformers version, generate() warns or raises in this case, so
        # skip it and hand the prompt back as-is.
        return prompt

    # The tokenizer returns CPU tensors; move them to wherever the model
    # lives so generation also works when the model is on a GPU.
    inputs = inputs.to(model.device)

    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.batch_decode(outputs)[0]


# Input widgets, listed in the same order as the parameters of generate().
_prompt_box = gr.Text(
    value="Write a detailed analogy between mathematics and a lighthouse.",
    label="prompt",
)
_length_box = gr.Number(label="max length", value=100, maximum=1000)

# Sample rows offered in the UI; each row is [prompt, max length].
_example_rows = [
    [
        "Instruct: Write a detailed analogy between mathematics and a lighthouse.",
        50,
    ],
    [
        "Instruct: Write a detailed analogy between mathematics and a lighthouse.\nOutput:",
        50,
    ],
    [
        "Alice: I don't know why, I'm struggling to maintain focus while studying. Any suggestions?\nBob: ",
        100,
    ],
    [
        # Code-completion prompt: a function header followed by its docstring.
        (
            "def print_prime(n):\n"
            '   """\n'
            "   Print all primes between 1 and n\n"
            '   """\n'
        ),
        200,
    ],
]

# Text-in / text-out interface around generate().
demo = gr.Interface(
    fn=generate,
    inputs=[_prompt_box, _length_box],
    outputs="text",
    examples=_example_rows,
    title="Microsoft Phi-2",
    description="Unofficial demo of Microsoft Phi-2, a high performing model with only 2.7B parameters.",
)


# Start the web UI only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch(show_api=False)