import os
import sys

import gradio as gr
import torch
from PIL import Image

# Ensure our working directory has the nanoVLM code
REPO_ID = "huggingface/nanoVLM"
LOCAL_REPO_DIR = "nanoVLM"

if not os.path.isdir(os.path.join(LOCAL_REPO_DIR, "models")):
    from git import Repo  # GitPython

    # Blobless, sparse clone so only the models/ folder is fetched.
    # Clone into a subdirectory: cloning into "." fails if the working
    # directory already contains files (e.g. this script).
    repo = Repo.clone_from(
        f"https://github.com/{REPO_ID}.git",
        LOCAL_REPO_DIR,
        depth=1,
        multi_options=["--filter=blob:none", "--sparse"],
    )
    # Enable sparse checkout of models/ only
    repo.git.sparse_checkout("set", "models")

# Add the repo root to the path so we can import the model code
sys.path.insert(0, os.path.abspath(LOCAL_REPO_DIR))
from models.vision_language_model import VisionLanguageModel

# Load the VLM
model = VisionLanguageModel.from_pretrained("lusxvr/nanoVLM-222M")
model.eval()


def predict(img: Image.Image, prompt: str = "") -> str:
    # Preprocess the image and add a batch dimension
    img_tensor = model.preprocess_image(img).unsqueeze(0)  # (1, 3, H, W)
    with torch.no_grad():
        # generate_text handles the prompt internally
        output = model.generate_text(img_tensor, prompt=prompt)
    return output


demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(
            lines=1,
            placeholder="Prompt (e.g. 'What is in this picture?')",
            label="Prompt",
        ),
    ],
    outputs=gr.Textbox(label="Model Output"),
    title="nanoVLM-222M Vision-Language Demo",
    description="A minimal Gradio app for image captioning and VQA with nanoVLM-222M.",
)

if __name__ == "__main__":
    demo.launch()