from collections import namedtuple

import spaces
import gradio as gr  
import torch  
from transformers import AutoTokenizer, AutoModelForCausalLM

# Markdown heading for the app. NOTE: the gr.Interface call at the bottom of
# this file passes a literal title string rather than this constant.
title = """# Minitron Story Generator"""

# Markdown text passed as the `description` argument of the Gradio interface.
description = """
# Minitron

Minitron is a family of small language models (SLMs) obtained by pruning [NVIDIA's](https://huggingface.co/nvidia) Nemotron-4 15B model, LLaMA3.1-8B or Mistral NeMO models.
We prune the number of transformer blocks, embedding size, attention heads, and MLP intermediate dimension of the model, following which we perform continued training with distillation to arrive at the final models.

# Short Story Generator
Welcome to the Short Story Generator! This application helps you create unique short stories based on your inputs.

This application will show you the output of several models in the Minitron family. Outputs are shown side by side so you can compare them.

**Instructions:**
1. **Main Character:** Describe the main character of your story. For example, "a brave knight" or "a curious cat".
2. **Setting:** Describe the setting where your story takes place. For example, "in an enchanted forest" or "in a bustling city".
3. **Plot Twist:** Add an interesting plot twist to make the story exciting. For example, "discovers a hidden treasure" or "finds a secret portal to another world".

After filling in these details, click the "Submit" button, and a short story will be generated for you.
"""

# Input components, listed in the same order as generate_story's parameters:
# three free-text fields followed by three sampling sliders.
inputs = [
    *(
        gr.Textbox(label=field_label, placeholder=hint)
        for field_label, hint in (
            ("Main Character", "e.g. a brave knight"),
            ("Setting", "e.g. in an enchanted forest"),
            ("Plot Twist", "e.g. discovers a hidden treasure"),
        )
    ),
    *(
        gr.Slider(minimum=low, maximum=high, value=initial, step=increment, label=slider_label)
        for low, high, initial, increment, slider_label in (
            (1, 2048, 64, 1, "Max new tokens"),
            (0.1, 4.0, 0.7, 0.1, "Temperature"),
            (0.1, 1.0, 0.95, 0.05, "Top-p (nucleus sampling)"),
        )
    ),
]

# Record bundling a model's display name with its loaded LLM and tokenizer.
Model = namedtuple("Model", "name llm tokenizer")

# Hugging Face Hub repository ids of the checkpoints to load.
model_paths = [
    "nvidia/Llama-3.1-Minitron-4B-Width-Base",
    "nvidia/Llama-3.1-Minitron-4B-Depth-Base",
    "nvidia/Mistral-NeMo-Minitron-8B-Base",
]

# Placement and precision used when loading every model.
device = "cuda"
dtype = torch.bfloat16

def _load_model(path):
    """Load the causal LM and tokenizer for ``path`` and wrap them in a Model.

    The Model's ``name`` is the last ``/``-separated component of ``path``.
    """
    llm = AutoModelForCausalLM.from_pretrained(path, torch_dtype=dtype, device_map=device)
    tokenizer = AutoTokenizer.from_pretrained(path)
    short_name = path.split("/")[-1]
    return Model(name=short_name, llm=llm, tokenizer=tokenizer)


# Eagerly load every checkpoint at import time, in model_paths order.
models = [_load_model(repo_id) for repo_id in model_paths]

# One output textbox per loaded model, in the same order as `models`.
outputs = [gr.Textbox(label=f"Generated Story ({loaded.name})") for loaded in models]

def create_prompt(instruction):
    """Return ``instruction`` wrapped in the instruction/response prompt template."""
    preamble = (
        "Below is an instruction that describes a task.\n\n"
        "Write a response that appropriately completes the request."
    )
    return f"{preamble}\n\n### Instruction:\n{instruction}\n\n### Response:"


@spaces.GPU
def generate_story(character, setting, plot_twist, max_tokens, temperature, top_p):
    """Generate one story continuation per loaded model.

    Args:
        character: Description of the main character.
        setting: Description of where the story takes place.
        plot_twist: Plot twist to include in the story.
        max_tokens: Maximum number of tokens to generate after the prompt.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.

    Returns:
        A list of generated texts (prompt excluded), one per entry in
        ``models`` and in the same order.
    """
    prompt = (
        "Write a short story with the following details:\n"
        f"Main character: {character}\n"
        f"Setting: {setting}\n"
        f"Plot twist: {plot_twist}\n\n"
        "Story:"
    )

    output_texts = []

    for model in models:
        input_ids = model.tokenizer.encode(prompt, return_tensors="pt").to(model.llm.device)
        # A single un-padded prompt attends to every position.
        attention_mask = torch.ones_like(input_ids)
        output_ids = model.llm.generate(
            input_ids,
            attention_mask=attention_mask,
            # The UI control is "Max new tokens": budget only the generated
            # part. `max_length` would also count the prompt tokens.
            max_new_tokens=int(max_tokens),
            # Without sampling enabled, temperature and top_p are ignored.
            do_sample=True,
            temperature=float(temperature),
            top_p=float(top_p),
            num_return_sequences=1,
        )
        # Drop the prompt by token count rather than by character count, so the
        # result does not depend on decode() reproducing the prompt verbatim.
        generated_ids = output_ids[0][input_ids.shape[-1]:]
        output_texts.append(model.tokenizer.decode(generated_ids, skip_special_tokens=True))

    return output_texts
   

# Wire the generator function to the input and output components.
demo = gr.Interface(
    generate_story,
    inputs,
    outputs,
    description=description,
    title="Short Story Generator",
)

# Start the Gradio server only when run as a script.
if __name__ == "__main__":
    demo.launch()