import os

if os.environ.get("SPACES_ZERO_GPU") is not None:
    import spaces
else:
    # Fallback shim so the script also runs outside ZeroGPU Spaces.
    class spaces:
        @staticmethod
        def GPU(*args, **kwargs):
            # Handle both @spaces.GPU and @spaces.GPU(duration=...).
            def decorator(func):
                def wrapper(*fargs, **fkwargs):
                    return func(*fargs, **fkwargs)
                return wrapper
            if len(args) == 1 and callable(args[0]) and not kwargs:
                return decorator(args[0])
            return decorator

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"

# model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model_id = "unsloth/Llama-3.2-11B-Vision-Instruct"
# model_id = "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit"
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"

model = MllamaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
processor = AutoProcessor.from_pretrained(model_id)


@spaces.GPU(duration=30)
def infer(message: str, url: str):
    kwargs = {}
    # Only fetch and pass an image when a plausible URL is given.
    image = Image.open(requests.get(url, stream=True).raw) if url and "http" in url else None
    if image is not None:
        kwargs["images"] = image
    # Include the image placeholder only when an image was actually loaded,
    # otherwise the processor complains about an <|image|> token with no image.
    content = ([{"type": "image"}] if image is not None else []) + [{"type": "text", "text": message}]
    messages = [{"role": "user", "content": content}]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    # input_text = "<|image|> If I had to write a haiku for this one, it would be: "
    inputs = processor(
        text=input_text,
        add_special_tokens=False,
        return_tensors="pt",
        **kwargs,
    ).to(model.device)
    output = model.generate(**inputs, max_new_tokens=30)
    output_str = processor.decode(output[0])
    print(message)
    print(url)
    print(output_str)
    return output_str


with gr.Blocks() as demo:
    with gr.Row():
        message = gr.Textbox(label="Message", value="Describe the image.", lines=1)
        image_url = gr.Textbox(label="Image URL", value=url, lines=1)
    run_button = gr.Button("Run", variant="primary")
    info_md = gr.Markdown("\n\n\n")  # placeholder; receives the decoded model output


") run_button.click(infer, [message, image_url], [info_md]) demo.launch() """ Describe the image. https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg <|begin_of_text|><|start_header_id|>user<|end_header_id|> <|image|>Describe the image.<|eot_id|><|start_header_id|>assistant<|end_header_id|> This image features a charming anthropomorphic rabbit, attired in a brown waistcoat and tan pants, with a blue coat draped over his shoulders, standing If I had to write a haiku for this one, it would be: <|begin_of_text|><|start_header_id|>user<|end_header_id|> <|image|>If I had to write a haiku for this one, it would be: <|eot_id|><|start_header_id|>assistant<|end_header_id|> It seems like you started to write a haiku but didn't finish. Would you like to complete it?<|eot_id|> Who are you? <|begin_of_text|><|start_header_id|>user<|end_header_id|> <|image|>Who are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|> I'm an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."<|eot_id|> """