Model Card for Clyine1/phi3_image_question_generator

A purpose-built model for multiple-choice question (MCQ) generation from cartoon images, intended for lower-primary language education. Phi3-mini is used as the language model (LM) and CLIP is used as the vision encoder.

Requires a CUDA-enabled device to run. Please run the model on Google Colab with a T4 GPU, or on a local CUDA device with at least 10 GB of available VRAM. Ensure that your CUDA installation matches your device's requirements.

Uses

The model is meant to be used with a custom pipeline. The pipeline returns a dict with the following keys:

questions (str)
choices (list)
answer (str)
desc (str)

import json
import random
import re

import requests
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration, Pipeline
from transformers.utils import PushToHubMixin

class ImageToQuestionPipeline():
    """Turn an image into a multiple-choice question using a LLaVA-style model.

    The pipeline holds one multi-turn conversation with the model: it first
    asks for a description of the image, then for a question, one correct
    answer and three incorrect answers.  The replies are assembled into a
    dict with the keys ``questions``, ``choices``, ``answer`` and ``desc``.
    """

    # Token ids used when generating and when locating the latest reply.
    # NOTE(review): presumably the ids of <|end|> and <|assistant|> in this
    # model's tokenizer -- confirm against the tokenizer config.
    _END_TOKEN_ID = 32007
    _ASSISTANT_TOKEN_ID = 32001

    # Follow-up user turns, sent in this order after the image description.
    _FOLLOW_UP_COMMANDS = (
        "Generate a simple question\n",
        "Suggest 1 correct answer\n",
        "Suggest 3 incorrect answers\n",
    )

    # Leading list marker on a distractor line, e.g. "1. ", "2) " or "- ".
    _ENUMERATOR = re.compile(r"^\s*(?:\d+[.)]|[-*])\s+")

    def __init__(self, llava_model):
        """Load the processor and the fp16 model onto CUDA device 0.

        Args:
            llava_model: Model name or path passed to ``from_pretrained``.
        """
        self.processor = AutoProcessor.from_pretrained(llava_model)
        self.llava_model = LlavaForConditionalGeneration.from_pretrained(llava_model, torch_dtype=torch.float16).to(0)

    def __call__(self, image_path):
        """Generate a multiple-choice question for the image at ``image_path``.

        Args:
            image_path: Path (or file object) accepted by ``PIL.Image.open``.

        Returns:
            dict with ``questions`` (str), ``choices`` (list of numbered
            strings), ``answer`` (str naming the correct choice) and
            ``desc`` (str, the image description).
        """
        # The whitespace inside this literal is part of the prompt the model
        # receives; it is kept exactly as it was.
        prompt ='''
            <|user|>\n<image>\nDescribe this image in a passage\n<|end|>\n
            <|assistant|>\n
            '''
        artifacts = []
        # The context manager closes the image file once generation is done.
        with Image.open(image_path) as raw_image:
            # First pass describes the image; each later pass appends the
            # previous reply plus the next command to the conversation.
            for command in (None, *self._FOLLOW_UP_COMMANDS):
                if command is not None:
                    prompt += "{}<|end|>\n<|user|>\n{}<|end|>\n<|assistant|>\n".format(artifacts[-1], command)
                artifacts.append(self._generate(prompt, raw_image))

        return self._build_result(artifacts)

    def _generate(self, prompt, image):
        """Run one generation step and return the text of the newest reply."""
        inputs = self.processor(prompt, image, return_tensors='pt').to(0)
        output = self.llava_model.generate(
            **inputs,
            eos_token_id=self._END_TOKEN_ID,
            max_new_tokens=500,
            do_sample=False,
        )
        sequence = output[0]
        # The output echoes the whole conversation; keep only what follows
        # the last occurrence of the assistant marker token.
        start = torch.where(sequence == self._ASSISTANT_TOKEN_ID)[0][-1].item()
        return self.processor.decode(sequence[start:], skip_special_tokens=True)

    @classmethod
    def _build_result(cls, artifacts):
        """Assemble the result dict from ``[desc, question, correct, distractors]``.

        The correct answer is inserted at a random position among the parsed
        distractors, and ``answer`` repeats that full numbered choice.
        """
        description, question, correct, raw_distractors = artifacts[:4]

        # One distractor per non-empty line, with any list marker removed.
        distractors = [
            cls._ENUMERATOR.sub("", line).strip()
            for line in raw_distractors.split("\n")
            if line.strip()
        ]

        # Bound the slot by the number of distractors actually parsed so the
        # index stays valid even if the model returned fewer than three.
        position = random.randint(0, len(distractors))
        options = distractors[:position] + [correct] + distractors[position:]
        choices = ["{}) {}".format(number, text) for number, text in enumerate(options, start=1)]

        # Each choice already carries its "N) " prefix, so format it once.
        answer = "Correct Answer: {}".format(choices[position])
        return {
            "questions": question,
            "choices": choices,
            "answer": answer,
            "desc": description,
        }


# Build the pipeline from the published model id; loading needs a CUDA device.
pipe = ImageToQuestionPipeline("Clyine1/phi3_image_question_generator")
# Placeholder: replace with the path to the image to generate a question for.
image_file_path = "<image_file_path>"
output = pipe(image_file_path)
# Pretty-print the resulting dict (questions, choices, answer, desc).
print(json.dumps(output, indent=4))

"""
Generated output:
{
    "questions": "What is the color of the shirt the girl in the center of the image is wearing?",
    "choices": [
        "1) The girl in the center of the image is wearing a pink shirt.",
        "2) The girl in the center of the image is wearing a blue shirt.",
        "3) The girl in the center of the image is wearing a red shirt.",
        "4) The girl in the center of the image is wearing a green shirt."
    ],
    "answer": "Correct Answer: 1) The girl in the center of the image is wearing a pink shirt.",
    "desc": "The image depicts a lively scene at a playground. In the foreground, a young girl is sitting on a green slide, her face reflecting a sense of surprise or shock. She is dressed in a pink shirt and a red hat. Behind her, a boy is standing on the same slide, his arms crossed in a defensive posture. He is wearing a red shirt and a blue hat.\n\nIn the background, a girl is sitting on a swing, her legs swinging back and forth. She is wearing a pink shirt and a red hat. Another girl is standing on a blue slide, her arms crossed in a similar defensive posture as the boy on the green slide. She is wearing a blue shirt and a red hat.\n\nIn the distance, a boy is standing on a yellow slide, his arms crossed in a defensive posture. He is wearing a yellow shirt and a red hat. Another boy is standing on a blue slide, his arms crossed in a similar defensive posture as the boy on the yellow slide. He is wearing a blue shirt and a red hat.\n\nThe playground is surrounded by trees and buildings, providing a natural and urban backdrop to the scene. The colors of the playground equipment and the clothing of the children are vibrant and contrasting, adding to the lively atmosphere of the image."
}
"""