import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import spaces

MODEL_PATH = "benhaotang/mistral-small-physics-finetuned-bnb-4bit"
MODEL_URL = f"https://huggingface.co/{MODEL_PATH}"

def load_model():
    # The checkpoint is already 4-bit quantized (bnb-4bit); this config only
    # enables fp32 CPU offload for any layers that do not fit on the GPU.
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=False,
        llm_int8_enable_fp32_cpu_offload=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map="auto",
        torch_dtype=torch.float16,
        offload_folder="offload_folder",
        quantization_config=bnb_config
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    return model, tokenizer

model, tokenizer = load_model()

# Request a ZeroGPU slot for up to 110 seconds per generation call.
@spaces.GPU(duration=110)
def generate_response(prompt, max_new_tokens=1024):
    # Move the tokenized prompt to the same device as the model's first layer.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # max_new_tokens caps the generated continuation, independent of prompt length.
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(
            label="Enter your physics question",
            placeholder="Ask me anything about physics...",
            lines=5
        ),
    ],
    outputs=gr.Textbox(label="Response", lines=10),
    title="Physics AI Assistant",
    description=f"""Ask questions about physics concepts, and I'll provide detailed explanations.

Model: [{MODEL_PATH}]({MODEL_URL})""",
    examples=[
        ["Give me a short introduction to renormalization group (RG) flow in physics."],
        ["What is quantum entanglement?"],
        ["Explain the concept of gauge symmetry in physics."]
    ]
)

demo.launch()
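
# Note: on a busy Space, all users share the single @spaces.GPU worker, so
# requests are processed one at a time. A minimal sketch of enabling Gradio's
# built-in request queue in place of the plain launch above (max_size=20 is
# an assumed, illustrative value, not a requirement of this demo):
#
#     demo.queue(max_size=20).launch()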