File size: 6,273 Bytes
03561be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""

 Copyright (c) 2022, salesforce.com, inc.

 All rights reserved.

 SPDX-License-Identifier: BSD-3-Clause

 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

"""

import json
import os
import random

import numpy as np
from PIL import Image
from transformers import LlamaTokenizer

from .vqa_dataset import VQADataset

# Pool of natural-language requests for an image description, caption or title.
# COCOCaptionDataset wraps every entry in its prompt template at construction
# time and draws one of the resulting prompts at random for each sample.
# These strings are emitted verbatim at runtime, so their wording, casing and
# punctuation are part of the data and should not be "normalized".
QUESTIONS = [
    "please describe the image",
    "can you describe the image",
    "Could you provide a description of the image?",
    "What do you see in this image?",
    "Share your thoughts on the content of the image.",
    "Please narrate what's happening in the picture.",
    "Can you give a brief explanation of the image?",
    "Describe the main elements and details present in the image.",
    "In your own words, what is depicted in the image?",
    "Can you outline the key aspects of the image?",
    "What are the most striking features in this image?",
    "Please provide a summary of the image's content.",
    "Describe the overall theme or concept captured in the image.",
    "How would you explain the image's composition and focus?",
    "What is the focal point or main subject of the image?",
    "How do the different components of the image interact with each other?",
    "What would be a fitting caption for this image?",
    "Can you create a concise description that captures the essence of the image?",
    "How would you briefly summarize the content of this image in a phrase or sentence?",
    "Please provide a catchy and relevant caption for this picture.",
    "If you were to give this image a title, what would it be?",
    "Describe the image in one creative sentence.",
    "Please suggest a memorable phrase that encapsulates the image's content.",
    "What engaging phrase would best represent this image?",
    "Can you create an expressive caption that highlights the main theme of the image?",
    "How would you sum up the image's story for a caption?",
    "Provide an eye-catching caption that conveys the image's core message.",
    "If you were to give this image a headline, what would it say?",
    "Can you craft a captivating caption that communicates the essence of the image?",
    "How would you describe the image's content in a powerful caption?",
    "Please provide an inventive title to summarize the scene depicted in the image.",
    "Compose a concise and striking phrase that reflects the image's key elements.",
    "If you were to create a caption for this image, what would it be?",
    "Offer a compelling caption that highlights the central focus of the image.",
    "Can you produce a unique caption that encapsulates the image's overall mood?",
    "Please generate an attention-grabbing caption that would best illustrate the events captured in this image",
    "How would you express the image's main idea in an impactful sentence?",
    "Please create a vivid and concise title that conveys the essence of the picture.",
    "Compose an imaginative caption that reflects the image's most striking features.",
    "What memorable statement would best represent the scene illustrated in this image?",
    "Draft an evocative caption that brings the image to life for the reader.",
    "Can you suggest an insightful caption that highlights the underlying message of the image?",
    "What engaging phrase would effectively convey the action or subject matter depicted in this picture?",
    "How would you encapsulate the image's core theme in a concise and expressive manner?",
    "Please provide a creative and impactful title that captures the spirit of the image.",
    "Craft a captivating caption that showcases the image's most prominent attributes.",
    "What intriguing statement would best sum up the scene presented in this image?",
    "Develop a descriptive caption that paints a vivid picture for the viewer.",
    "Can you give a detailed account of the image's contents?",
    "What are the key elements and features visible in this image?",
    "How would you narrate the events or actions depicted in the picture?",
    "Please share your observations about the various components present in the image.",
    "What is the overall theme or concept captured in this image? Can you describe it?",
]


class COCOCaptionDataset(VQADataset):
    """Caption annotations exposed as (instruction, answer) text pairs plus an image.

    ``process_text`` pairs a randomly chosen prompt (built from ``QUESTIONS``)
    with a randomly chosen caption of the annotated image; ``process_image``
    loads that image and runs it through ``vis_processor``.
    """

    def __init__(
        self, tokenizer, vis_processor=None, vis_root=None, ann_paths=None, add_eos=True, ignore_instruction=True
    ):
        """Load the annotation files and pre-build the instruction prompts.

        NOTE(review): ``VQADataset.__init__`` is not called here; all state is
        set directly on the instance. Confirm this is intentional.

        Args:
            tokenizer: Tokenizer stored as ``self.tokenizer``. It is not used
                in this class; presumably the base class consumes it.
            vis_processor: Callable applied to each RGB ``PIL.Image`` in
                ``process_image``. The ``None`` default makes
                ``process_image`` fail when it tries to call it.
            vis_root (string): Root directory of images (e.g. coco/images/).
                ``ann["image"]`` is joined onto it.
            ann_paths: Iterable of paths to JSON annotation files. The
                contents of every file are concatenated into
                ``self.annotation``, so each file is expected to hold a JSON
                array of records. ``None`` (the default) means no files.
            add_eos: Stored as ``self.add_eos``; not used in this class
                (presumably read by the base class).
            ignore_instruction: Stored as ``self.ignore_instruction``; not
                used in this class (presumably read by the base class).
        """
        self.tokenizer: LlamaTokenizer = tokenizer
        self.vis_root = vis_root

        # ``None`` replaces the former mutable ``[]`` default; both mean
        # "no annotation files".
        self.annotation = []
        for ann_path in ann_paths or ():
            # Use a context manager so the file handle is closed promptly
            # instead of waiting for garbage collection.
            with open(ann_path, "r") as ann_file:
                self.annotation.extend(json.load(ann_file))

        self.vis_processor = vis_processor

        # Every question is wrapped once, up front, in the same prompt
        # template; "<image>" marks where the image belongs in the prompt.
        template = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Image:\n{image}\n\n### Instruction:\n{question}\n\n### Response:\n"
        self.instructions = [template.format(image="<image>", question=question) for question in QUESTIONS]
        self.add_eos = add_eos
        self.ignore_instruction = ignore_instruction

    def process_image(self, ann):
        """Load the image referenced by ``ann`` and run it through ``vis_processor``.

        Args:
            ann: Annotation record; ``ann["image"]`` is a path relative to
                ``self.vis_root``.

        Returns:
            Whatever ``self.vis_processor`` returns for the RGB-converted image.
        """
        image_path = os.path.join(self.vis_root, ann["image"])
        # ``convert`` yields an independent in-memory image, so the source
        # file can be closed as soon as the conversion is done.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        return self.vis_processor(image)

    def process_text(self, ann):
        """Pick a random prompt and a random caption for ``ann``.

        Args:
            ann: Annotation record; ``ann["caption"]`` is either a single
                caption or a list of captions.

        Returns:
            dict with keys ``instruction`` (one of ``self.instructions``) and
            ``answer`` (the chosen caption).
        """
        all_captions = ann["caption"]
        # Normalize a lone caption to a one-element list so both shapes share
        # the same sampling path.
        if not isinstance(all_captions, list):
            all_captions = [all_captions]
        caption = random.choice(all_captions)
        instruction = random.choice(self.instructions)

        return dict(instruction=instruction, answer=caption)