Can the 8b model get correct results for the cases shown on the official blog?

#28
by YuntaoChen - opened

My inference code is listed below.
Fuyu-8b gives reasonable results for the text & image prompt examples on the Hugging Face page, but outputs nonsense for anything else.
See "life expectancy case v2" and "life expectancy case v3" for the results of just slightly modifying the text prompts.
See "big ben coco caption case" and the other cases below for the text and image prompts used in the official blog.

I'm wondering if this is the expected behavior of fuyu-8b or if I made some mistake in the inference code.

import torch
from transformers import FuyuProcessor, FuyuForCausalLM
from PIL import Image

# load model and processor
model_id = "adept/fuyu-8b"
processor = FuyuProcessor.from_pretrained(model_id)
model = FuyuForCausalLM.from_pretrained(model_id, device_map="cuda:0", torch_dtype=torch.float16)


def inference_lmm(text_prompt, image_path):
    image = Image.open(image_path)

    inputs = processor(text=text_prompt, images=image, return_tensors="pt")
    # move every tensor in the processor output onto the GPU
    for k, v in inputs.items():
        inputs[k] = v.to("cuda:0")

    # autoregressively generate text
    generation_output = model.generate(**inputs, max_new_tokens=50)
    # note: generate() returns prompt + new tokens, so slicing the last 50 tokens
    # can pull in prompt tokens (including the |SPEAKER|/|NEWLINE| image
    # placeholders) whenever generation stops early; see the variant after this listing
    generation_text = processor.batch_decode(generation_output[:, -50:], skip_special_tokens=True)
    print(f"input text: {text_prompt}")
    print(f"generated text: {generation_text}")


# coco caption case
text_prompt = "Generate a coco-style caption.\n"
image_path = "bus.png"
inference_lmm(text_prompt, image_path)

# bus color case
text_prompt = "What color is the bus?\n"
image_path = "bus.png"
inference_lmm(text_prompt, image_path)

# life expectancy case
text_prompt = "What is the highest life expectancy at birth of male?\n"
image_path = "chart.png"
inference_lmm(text_prompt, image_path)

# life expectancy case v2
text_prompt = "What is the highest life expectancy at birth of female?\n"
image_path = "chart.png"
inference_lmm(text_prompt, image_path)

# life expectancy case v3
text_prompt = "What is the lowest life expectancy at birth of male?\n"
image_path = "chart.png"
inference_lmm(text_prompt, image_path)

# big ben coco caption case
text_prompt = "Generate a coco-style caption.\n"
image_path = "big_ben.png"
inference_lmm(text_prompt, image_path)

# hbo case
text_prompt = "Aidan Gillen acted in how many series?"
image_path = "hbo.png"
inference_lmm(text_prompt, image_path)

# twitter_graph case
text_prompt = "Find missing data of the sequence 24, _ ,32, 33, 42?"
image_path = "twitter_graph.png"
inference_lmm(text_prompt, image_path)

# vacation_days case
text_prompt = "What was the fair amount of paid vacation days in the UK?"
image_path = "vacation_days.png"
inference_lmm(text_prompt, image_path)

# job case
text_prompt = "Which is the metro in California that has a good job Outlook?"
image_path = "job.png"
inference_lmm(text_prompt, image_path)

# pdf case
text_prompt = "What was the pack spinner capacity?"
image_path = "pdf.png"
inference_lmm(text_prompt, image_path)

# leaf_shapes case
text_prompt = "What letter does a keel-shaped cross-section look like?"
image_path = "leaf_shapes.png"
inference_lmm(text_prompt, image_path)

# red_tree_vole case
text_prompt = "If in the food web shown in the diagram, Douglas fir tree needles are absent, which organism would starve?"
image_path = "red_tree_vole.png"
inference_lmm(text_prompt, image_path)
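
For comparison, here is a variant of the decode step in inference_lmm that only decodes the tokens generated after the prompt, a minimal sketch using the usual transformers slicing idiom; it keeps the |SPEAKER|/|NEWLINE| image-placeholder tokens out of the decoded text. (The logs below were produced with the original -50 slicing.)

prompt_len = inputs["input_ids"].shape[1]  # generate() returns prompt + new tokens
generation_text = processor.batch_decode(
    generation_output[:, prompt_len:], skip_special_tokens=True
)
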
input text: Generate a coco-style caption.
generated text: ['|SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE|<s> Generate a coco-style caption.\n\x04 A bus parked on the side of a road.']

input text: What color is the bus?
generated text: ['|SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE|<s> What color is the bus?\n\x04 The bus is blue.\n']

input text: What is the highest life expectancy at birth of male?
generated text: ['|SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE|<s> What is the highest life expectancy at birth of male?\n\x04 The life expectancy at birth of males in 2018 is 80.7.\n']

input text: What is the highest life expectancy at birth of female?
generated text: ['|SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE|<s> What is the highest life expectancy at birth of female?\n\x04 The life expectancy at birth of female is 80.2.\n']

input text: What is the lowest life expectancy at birth of male?
generated text: ['|SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE|<s> What is the lowest life expectancy at birth of male?\n\x04 The life expectancy at birth of males in 2018 is 80.7.\n']

input text: Generate a coco-style caption.
generated text: ['|SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE|<s> Generate a coco-style caption.\n\x04 The city is lit up at night with traffic and lights.']

input text: Aidan Gillen acted in how many series?
generated text: ['|NEWLINE||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE|<s> Aidan Gillen acted in how many series?\x04 3']

input text: Find missing data of the sequence 24, _ ,32, 33, 42?
generated text: ['|SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE|<s> Find missing data of the sequence 24, _ ,32, 33, 42?\x04 32, 33, 42']

input text: What was the fair amount of paid vacation days in the UK?
generated text: ['|SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE|<s> What was the fair amount of paid vacation days in the UK?\x04 How many days did the UK spend in paid vacation in 2019?']

input text: Which is the metro in California that has a good job Outlook?
generated text: ['"PCO Configurations" "Time Savers, New Orleans - 1977" "Results" "A Single PCD with a capacity of 104 packs was used in the test." "Results" "Sales of brand style']

input text: What was the pack spinner capacity?
generated text: ['|SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE|<s> What was the pack spinner capacity?\x04 Each display was loaded with various styles of vantage, more and/or NOW.']

input text: What letter does a keel-shaped cross-section look like?
generated text: ['|SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||SPEAKER||NEWLINE|<s> What letter does a keel-shaped cross-section look like?\x04 B']

input text: If in the food web shown in the diagram, Douglas fir tree needles are absent, which organism would starve?
generated text: ['Black-tailed Deer.\n\nThe food web in the image shows deer, carpenter ants, fern, lichen, cougars, and mountain beaver as food sources. If the deer were to die, the carpenter ants would']

I think we're going to release an update to make sure the beginning-of-sentence and end-of-sentence tokens are properly added to the sequences, making sure our results match the demos! cc @Molbap
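
In the meantime, a quick way to inspect which special tokens the processor currently adds, a sketch reusing the processor and bus.png from the snippet above:

from PIL import Image

# build the inputs for one example and look at the first/last few tokens
ids = processor(text="What color is the bus?\n", images=Image.open("bus.png"), return_tensors="pt")["input_ids"][0].tolist()
print(processor.tokenizer.convert_ids_to_tokens(ids[:5]))   # should show the leading image-placeholder tokens
print(processor.tokenizer.convert_ids_to_tokens(ids[-5:]))  # trailing tokens around the text prompt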

Hi @YuntaoChen , thanks a lot for the thorough testing, it was useful! We have an update as part of the latest transformers release:
First, the model now supports batching, so you can pass a list of prompts and an equal-length list of images to the processor to get the model inputs ready for generation. Second, the prompt structure does matter: you have to prompt the model to answer in a VQA fashion. We've also updated a few examples to better reflect the capabilities of the released model. Try this out:

from PIL import Image
import requests
import io
from transformers import FuyuForCausalLM, FuyuProcessor

pretrained_path = "adept/fuyu-8b"
processor = FuyuProcessor.from_pretrained(pretrained_path)
model = FuyuForCausalLM.from_pretrained(pretrained_path, device_map='auto')


text_prompt = "Answer the following DocVQA question based on the image. \n Which is the metro in California that has a good job Outlook?"
jobs_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/jobs.png"
jobs_image_pil = Image.open(io.BytesIO(requests.get(jobs_image_url).content))

second_text_prompt = "Answer the following DocVQA question based on the image. \n What is the maximum male life expectancy?"
chart_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/chart.png"
chart_image_pil = Image.open(io.BytesIO(requests.get(chart_image_url).content))

third_text_prompt = "Answer the following DocVQA question based on the image. \n What sport is that?"
skate_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/skateboard.png"
skate_image_pil = Image.open(io.BytesIO(requests.get(skate_image_url).content))

fourth_text_prompt = "Answer the following DocVQA question based on the image. \n What was the fair amount of paid vacation days in the United Kingdom?"
vacations_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/vacation_days_hr.png"
vacations_image_pil = Image.open(io.BytesIO(requests.get(vacations_image_url).content)).convert('RGB')

fifth_text_prompt = "Answer the following VQAv2 question based on the image: What type of foods are in the image?"
fish_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/fish_carrots.png"
fish_image_pil = Image.open(io.BytesIO(requests.get(fish_image_url).content))


texts = [text_prompt, second_text_prompt, third_text_prompt, fourth_text_prompt, fifth_text_prompt]
images = [jobs_image_pil, chart_image_pil, skate_image_pil, vacations_image_pil, fish_image_pil]

model_inputs = processor(text=texts, images=images, return_tensors="pt").to('cuda')


model_outputs = processor.tokenizer.batch_decode(model.generate(
    **model_inputs, max_new_tokens=10)[:, -10:], skip_special_tokens=True)

ground_truths = ['Los Angeles', '80.7', 'skateboarding', '28', 'fish, carrots, lemon']


for ground_truth, model_output in zip(ground_truths, model_outputs):
    # '\x04' is the separator Fuyu emits between the prompt and its answer
    prediction = model_output.split('\x04 ', 1)[1] if '\x04 ' in model_output else ''
    assert ground_truth == prediction
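
If you just want the plain answers rather than the assertions, a minimal sketch (same variables as above) that strips everything up to and including the answer separator:

for text, output in zip(texts, model_outputs):
    # split on the separator; [-1] keeps the whole string if it is absent
    answer = output.split('\x04', 1)[-1].strip()
    print(f"Q: {text}\nA: {answer}")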
