import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load example images
image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
image2 = load_image("https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg")

# Initialize processor and model
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)


def use_VLM(query: str, image_path: str):
    image_file = load_image(image=image_path)

    # Create input messages
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": query},
            ],
        },
    ]

    # Prepare inputs
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image_file], return_tensors="pt")
    inputs = inputs.to(DEVICE)

    # Generate outputs
    generated_ids = model.generate(**inputs, max_new_tokens=500)
    generated_texts = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )
    print(generated_texts[0])
    return generated_texts[0]


# use_VLM(query="what color is the arrow in the image?", image_path="360_F_640048974_VeXJUEUzGbSzbAeXQwbwnjDOz3ahRjjO.jpg")
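
# Usage sketch (illustrative, kept commented out like the call above): because
# use_VLM passes image_path through load_image, it accepts remote URLs as well
# as local files. The URL below is the Statue of Liberty image already fetched
# as image1; the query string is just an example prompt.
# answer = use_VLM(
#     query="What monument is shown in this image?",
#     image_path="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
# )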