import os
import sys

import gradio as gr
import torch
from PIL import Image

# Ensure our working directory has the nanoVLM code
REPO_ID = "huggingface/nanoVLM"
LOCAL_REPO_DIR = "nanoVLM"

if not os.path.isdir(os.path.join(LOCAL_REPO_DIR, "models")):
    from git import Repo  # GitPython

    # Blobless, sparse clone so only the models/ folder is fetched.
    # Clone into a subdirectory: cloning into "." fails if the working
    # directory already contains files (e.g. this script).
    repo = Repo.clone_from(
        f"https://github.com/{REPO_ID}.git",
        LOCAL_REPO_DIR,
        depth=1,
        multi_options=["--filter=blob:none", "--sparse"],
    )
    # Enable sparse checkout of models/ only
    repo.git.sparse_checkout("set", "models")

# Add the repo root to the path so we can import the model code
sys.path.insert(0, os.path.abspath(LOCAL_REPO_DIR))
from models.vision_language_model import VisionLanguageModel

# Load the VLM
model = VisionLanguageModel.from_pretrained("lusxvr/nanoVLM-222M")
model.eval()


def predict(img: Image.Image, prompt: str = "") -> str:
    # Preprocess the image and add a batch dimension
    img_tensor = model.preprocess_image(img).unsqueeze(0)  # (1, 3, H, W)
    with torch.no_grad():
        # generate_text handles the prompt internally
        output = model.generate_text(img_tensor, prompt=prompt)
    return output


demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(
            lines=1,
            placeholder="Prompt (e.g. 'What is in this picture?')",
            label="Prompt",
        ),
    ],
    outputs=gr.Textbox(label="Model Output"),
    title="nanoVLM-222M Vision-Language Demo",
    description="A minimal Gradio app for image captioning and VQA with nanoVLM-222M.",
)

if __name__ == "__main__":
    demo.launch()