"""Gradio demo: image + text chat with Llama 3.2 11B Vision Instruct."""

import os

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

# Hugging Face token (the checkpoint is loaded with an access token).
hf_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
if not hf_token:
    raise ValueError("HUGGING_FACE_HUB_TOKEN not found.")

# Model
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    model_name,
    token=hf_token,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# `token=` replaces the deprecated `use_auth_token=` and matches the model
# loading call above.
processor = AutoProcessor.from_pretrained(model_name, token=hf_token)


@spaces.GPU
def predict(image: Image.Image, text: str) -> str:
    """Generate the model's reply to a single image + text prompt.

    Args:
        image: PIL image supplied by the Gradio image input.
        text: The user's prompt from the Gradio textbox.

    Returns:
        The generated reply, prefixed with a newline.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Without an image the processor call fails with an opaque error;
        # surface a readable message in the UI instead.
        raise gr.Error("Please provide an image.")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": text},
            ],
        }
    ]
    input_text = processor.apply_chat_template(
        messages, add_generation_prompt=True
    )
    # The chat template already emits the begin-of-text token, so the
    # tokenizer must not add special tokens a second time.
    inputs = processor(
        images=image,
        text=input_text,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=250)

    # The generated sequence starts with the prompt tokens. Decode only what
    # follows them rather than splitting the text on the word "assistant",
    # which breaks when the prompt itself contains that word and raises
    # IndexError when it is absent.
    prompt_length = inputs["input_ids"].shape[-1]
    response = processor.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()
    return f"\n{response}"


# Gradio
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Image Input"),
        gr.Textbox(label="Text Input"),
    ],
    outputs=gr.Textbox(label="Output"),
    title="Llama 3.2 11B Vision Instruct Chat",
    description="Image + text chat.",
)

interface.launch()