I am using this model but I am unable to generate the response in more than a word, for example,
my question is describe this picture it response me, No.

here is code,

import requests
from PIL import Image
import matplotlib.pyplot as plt
from transformers import BlipProcessor, BlipForQuestionAnswering

processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda")

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

Display the image

plt.imshow(raw_image)
plt.axis('off')
plt.show()

question = "how many dogs are in the picture?"
inputs = processor(raw_image, question, return_tensors="pt").to("cuda")

out = model.generate(**inputs)

Decode and print every possible output

for i in range(out.shape[0]):
answer = processor.decode(out[i], skip_special_tokens=True)
print(f"Answer {i + 1}: {answer}")

import requests from PIL import Image from transformers import BlipProcessor, BlipForQuestionAnswering processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base") model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda") img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB') question = "how many dogs are in the picture?" inputs = processor(raw_image, question, return_tensors="pt").to("cuda") out = model.generate(**inputs) print(processor.decode(out[0], skip_special_tokens=True)) >>> 1

Salesforce
/

blip-vqa-base

How I can generate the ttext more than a single word?

Display the image

Decode and print every possible output