ManishThota committed on
Commit
1fea0a0
1 Parent(s): 3245c07

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +7 -0
  2. ChatUniVi/__init__.py +1 -0
  3. ChatUniVi/__pycache__/__init__.cpython-310.pyc +0 -0
  4. ChatUniVi/__pycache__/constants.cpython-310.pyc +0 -0
  5. ChatUniVi/__pycache__/conversation.cpython-310.pyc +0 -0
  6. ChatUniVi/__pycache__/mm_utils.cpython-310.pyc +0 -0
  7. ChatUniVi/__pycache__/utils.cpython-310.pyc +0 -0
  8. ChatUniVi/config/__init__.py +15 -0
  9. ChatUniVi/config/dataset_config.py +41 -0
  10. ChatUniVi/config/mode_config.py +24 -0
  11. ChatUniVi/constants.py +19 -0
  12. ChatUniVi/conversation.py +273 -0
  13. ChatUniVi/demo.py +160 -0
  14. ChatUniVi/eval/evaluate/evaluate_benchmark_1_correctness.py +199 -0
  15. ChatUniVi/eval/evaluate/evaluate_benchmark_1_correctness_mi.py +201 -0
  16. ChatUniVi/eval/evaluate/evaluate_benchmark_2_detailed_orientation.py +199 -0
  17. ChatUniVi/eval/evaluate/evaluate_benchmark_2_detailed_orientation_mi.py +201 -0
  18. ChatUniVi/eval/evaluate/evaluate_benchmark_3_context.py +199 -0
  19. ChatUniVi/eval/evaluate/evaluate_benchmark_3_context_mi.py +201 -0
  20. ChatUniVi/eval/evaluate/evaluate_benchmark_4_temporal.py +198 -0
  21. ChatUniVi/eval/evaluate/evaluate_benchmark_4_temporal_mi.py +200 -0
  22. ChatUniVi/eval/evaluate/evaluate_benchmark_5_consistency.py +207 -0
  23. ChatUniVi/eval/evaluate/evaluate_benchmark_5_consistency_mi.py +208 -0
  24. ChatUniVi/eval/evaluate/evaluate_gpt_review_visual.py +118 -0
  25. ChatUniVi/eval/evaluate/evaluate_science_qa.py +142 -0
  26. ChatUniVi/eval/evaluate/evaluate_video_qa.py +212 -0
  27. ChatUniVi/eval/evaluate/evaluate_video_qa_mi.py +214 -0
  28. ChatUniVi/eval/evaluate/summarize_gpt_review.py +67 -0
  29. ChatUniVi/eval/model_coco_vqa.py +220 -0
  30. ChatUniVi/eval/model_video_consistency.py +252 -0
  31. ChatUniVi/eval/model_video_general.py +213 -0
  32. ChatUniVi/eval/model_video_qa.py +232 -0
  33. ChatUniVi/eval/model_vqa.py +125 -0
  34. ChatUniVi/eval/model_vqa_scienceqa.py +164 -0
  35. ChatUniVi/eval/questions/coco2014_val_qa_eval/qa90_gpt4_answer.jsonl +90 -0
  36. ChatUniVi/eval/questions/coco2014_val_qa_eval/qa90_questions.jsonl +90 -0
  37. ChatUniVi/eval/questions/coco_pope/coco_pope_adversarial.jsonl +0 -0
  38. ChatUniVi/eval/questions/coco_pope/coco_pope_popular.jsonl +0 -0
  39. ChatUniVi/eval/questions/coco_pope/coco_pope_random.jsonl +0 -0
  40. ChatUniVi/eval/questions/scienceqa/pid_splits.json +0 -0
  41. ChatUniVi/eval/questions/scienceqa/problems.json +3 -0
  42. ChatUniVi/eval/questions/scienceqa/test_QCM-LEA.json +0 -0
  43. ChatUniVi/eval/questions/video_qa/activitynet_a_list.json +2105 -0
  44. ChatUniVi/eval/questions/video_qa/activitynet_qa.json +0 -0
  45. ChatUniVi/eval/questions/video_qa/consistency_qa.json +0 -0
  46. ChatUniVi/eval/questions/video_qa/generic_qa.json +0 -0
  47. ChatUniVi/eval/questions/video_qa/msrvtt_a_list.json +4175 -0
  48. ChatUniVi/eval/questions/video_qa/msrvtt_qa.json +0 -0
  49. ChatUniVi/eval/questions/video_qa/msvd_a_list.json +1202 -0
  50. ChatUniVi/eval/questions/video_qa/msvd_qa.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ ChatUniVi/eval/questions/scienceqa/problems.json filter=lfs diff=lfs merge=lfs -text
+ examples/image0.jpg filter=lfs diff=lfs merge=lfs -text
+ examples/video0.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video1.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video2.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video3.mp4 filter=lfs diff=lfs merge=lfs -text
+ examples/video4.mp4 filter=lfs diff=lfs merge=lfs -text
ChatUniVi/__init__.py ADDED
@@ -0,0 +1 @@
+ from .model import ChatUniViLlamaForCausalLM
ChatUniVi/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (215 Bytes)
ChatUniVi/__pycache__/constants.cpython-310.pyc ADDED
Binary file (752 Bytes)
ChatUniVi/__pycache__/conversation.cpython-310.pyc ADDED
Binary file (7.38 kB)
ChatUniVi/__pycache__/mm_utils.cpython-310.pyc ADDED
Binary file (3.39 kB)
ChatUniVi/__pycache__/utils.cpython-310.pyc ADDED
Binary file (4.04 kB)
ChatUniVi/config/__init__.py ADDED
@@ -0,0 +1,15 @@
from .dataset_config import *
from .mode_config import *


ModelConfig = {
    "PRETUNE": model_config_pretune,
    "FINETUNE": model_config_finetune,
}


DataConfig = {
    "Pretrain": [Pretrain, COCO_CAP, COCO_REG, COCO_REC],
    "SQA": [SQA],
    "FINETUNE": [VIT, MIMIC_imageonly, VIDEO],
}
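A minimal sketch of how these two registries are typically consumed; the key choices and the training entry point are illustrative assumptions, not code from this commit:

    from ChatUniVi.config import ModelConfig, DataConfig

    model_config = ModelConfig["FINETUNE"]   # tuning/cluster flags from mode_config.py
    dataset_list = DataConfig["FINETUNE"]    # list of dataset dicts from dataset_config.py
    print(model_config["use_cluster"], [d["chat_path"] for d in dataset_list])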
ChatUniVi/config/dataset_config.py ADDED
@@ -0,0 +1,41 @@
Pretrain = {
    "chat_path": "${PATH}/CC3M-595K/chat.json",
    "CC3M": "${PATH}/CC3M-595K",
}

VIT = {
    "chat_path": "${PATH}/llava_instruct_150k.json",
    "COCO2017": "${PATH}/COCO2017/train2017",
}

MIMIC_imageonly = {
    "chat_path": "${PATH}/MIMIC-IT-imageonly.json",
    "CDG": "${PATH}/CGD/images",
    "LA": "${PATH}/LA/images",
    "SD": "${PATH}/SD/images",
}

COCO_CAP = {
    "chat_path": "${PATH}/COCO/coco_cap_chat.json",
    "COCO2014": "${PATH}/COCO2014/train2014",
}

COCO_REG = {
    "chat_path": "${PATH}/COCO/coco_reg_chat.json",
    "COCO2014": "${PATH}/COCO2014/train2014",
}

COCO_REC = {
    "chat_path": "${PATH}/COCO/coco_rec_chat.json",
    "COCO2014": "${PATH}/COCO2014/train2014",
}

VIDEO = {
    "chat_path": "${PATH}/video_chat.json",
    "VIDEO": "${PATH}/Activity_Videos",
}

SQA = {
    "chat_path": "${PATH}/llava_train_QCM-LEA.json",
    "ScienceQA": "${PATH}/scienceqa/train",
}
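The "${PATH}" prefixes above are placeholders for a local dataset root. One hedged way to substitute them at runtime (the DATA_ROOT value is hypothetical and this helper is not part of the commit):

    import ChatUniVi.config.dataset_config as dataset_config

    DATA_ROOT = "/data/chatunivi"  # hypothetical local dataset root
    for name in ["Pretrain", "VIT", "MIMIC_imageonly", "COCO_CAP", "COCO_REG", "COCO_REC", "VIDEO", "SQA"]:
        cfg = getattr(dataset_config, name)
        for key, value in cfg.items():
            cfg[key] = value.replace("${PATH}", DATA_ROOT)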
ChatUniVi/config/mode_config.py ADDED
@@ -0,0 +1,24 @@
model_config_pretune = {
    "use_cluster": True,
    "freeze": False,
    "vision_tune": False,

    "spatial_cluster_rate0": 64,  # 0.25
    "spatial_cluster_rate1": 32,  # 0.5
    "spatial_cluster_rate2": 16,  # 0.5

    "temporal_cluster_rate": 1/16,
}

model_config_finetune = {
    "use_cluster": True,
    "freeze": False,
    "mm_tune": True,
    "vision_tune": False,

    "spatial_cluster_rate0": 64,  # 0.25
    "spatial_cluster_rate1": 32,  # 0.5
    "spatial_cluster_rate2": 16,  # 0.5

    "temporal_cluster_rate": 1/16,
}
ChatUniVi/constants.py ADDED
@@ -0,0 +1,19 @@
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
MAX_IMAGE_LENGTH = 64
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_VIDEO_TOKEN = "<video>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
DEFAULT_VIDEO_PATCH_TOKEN = "<vid_patch>"
DEFAULT_VID_START_TOKEN = "<vid_start>"
DEFAULT_VID_END_TOKEN = "<vid_end>"
DEFAULT_BOX_START_TOKEN = "<box_start>"
DEFAULT_BOX_END_TOKEN = "<box_end>"
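A hedged sketch of how these token constants fit together when building a multimodal prompt, mirroring the tokenizer_image_token call that appears in ChatUniVi/demo.py later in this diff; `tokenizer` is assumed to be an already-loaded tokenizer and the question text is made up:

    from ChatUniVi.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
    from ChatUniVi.mm_utils import tokenizer_image_token  # shipped in this repo

    qs = DEFAULT_IMAGE_TOKEN + "\nWhat is the person doing in this video?"
    # <image> is spliced into the token ids as the sentinel IMAGE_TOKEN_INDEX (-200)
    input_ids = tokenizer_image_token(qs, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0)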
ChatUniVi/conversation.py ADDED
@@ -0,0 +1,273 @@
import dataclasses
from enum import auto, Enum
from typing import List, Tuple


class SeparatorStyle(Enum):
    """Different separator style."""
    SINGLE = auto()
    TWO = auto()
    MPT = auto()
    PLAIN = auto()
    LLAMA_2 = auto()


@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history."""
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    sep2: str = None
    version: str = "Unknown"

    skip_next: bool = False

    def get_prompt(self):
        messages = self.messages
        if len(messages) > 0 and type(messages[0][1]) is tuple:
            messages = self.messages.copy()
            init_role, init_msg = messages[0].copy()
            init_msg = init_msg[0].replace("<image>", "").strip()
            if 'mmtag' in self.version:
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                messages[0] = (init_role, "<image>\n" + init_msg)

        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + self.sep
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.MPT:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
        elif self.sep_style == SeparatorStyle.LLAMA_2:
            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
            ret = ""

            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    if i == 0: message = wrap_sys(self.system) + message
                    if i % 2 == 0:
                        message = wrap_inst(message)
                        ret += self.sep + message
                    else:
                        ret += " " + message + " " + self.sep2
                else:
                    ret += ""
            ret = ret.lstrip(self.sep)
        elif self.sep_style == SeparatorStyle.PLAIN:
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += message + seps[i % 2]
                else:
                    ret += ""
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return ret

    def append_message(self, role, message):
        self.messages.append([role, message])

    def get_images(self, return_pil=False):
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO
                    from PIL import Image
                    msg, image, image_process_mode = msg
                    if image_process_mode == "Pad":
                        def expand2square(pil_img, background_color=(122, 116, 104)):
                            width, height = pil_img.size
                            if width == height:
                                return pil_img
                            elif width > height:
                                result = Image.new(pil_img.mode, (width, width), background_color)
                                result.paste(pil_img, (0, (width - height) // 2))
                                return result
                            else:
                                result = Image.new(pil_img.mode, (height, height), background_color)
                                result.paste(pil_img, ((height - width) // 2, 0))
                                return result
                        image = expand2square(image)
                    elif image_process_mode == "Crop":
                        pass
                    elif image_process_mode == "Resize":
                        image = image.resize((336, 336))
                    else:
                        raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if H > W:
                        H, W = longest_edge, shortest_edge
                    else:
                        H, W = shortest_edge, longest_edge
                    image = image.resize((W, H))
                    if return_pil:
                        images.append(image)
                    else:
                        buffered = BytesIO()
                        image.save(buffered, format="PNG")
                        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                        images.append(img_b64_str)
        return images

    def to_gradio_chatbot(self):
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO
                    msg, image, image_process_mode = msg
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if H > W:
                        H, W = longest_edge, shortest_edge
                    else:
                        H, W = shortest_edge, longest_edge
                    image = image.resize((W, H))
                    buffered = BytesIO()
                    image.save(buffered, format="JPEG")
                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                    img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
                    ret.append([img_str, None])
                    msg = msg.replace('<image>', '').strip()
                    if len(msg) > 0:
                        ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def copy(self):
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version)

    def dict(self):
        if len(self.get_images()) > 0:
            return {
                "system": self.system,
                "roles": self.roles,
                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
                "offset": self.offset,
                "sep": self.sep,
                "sep2": self.sep2,
            }
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }


conv_v1 = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)


simple_qa = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions. "
           " If you don't know the answer to a question, please don't share false information.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "Is there a snowboard in the image?"),
        ("Assistant", "Yes, there is a snowboard in the image, and the person is riding it down a snow-covered slope.")
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

simple_conv = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "Hi!"),
        ("Assistant", "Hi there! How can I help you today?")
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)


default_conversation = simple_conv
conv_templates = {
    "default": simple_conv,
    "simple": simple_conv,
    "simpleqa": simple_qa,
    "v1": conv_v1,
}


if __name__ == "__main__":
    print(default_conversation.get_prompt())
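A short usage sketch of the Conversation API defined above; the question text is made up for illustration:

    from ChatUniVi.conversation import conv_templates

    conv = conv_templates["v1"].copy()
    conv.append_message(conv.roles[0], "<image>\nWhat is shown in the image?")
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()  # "USER: ... ASSISTANT:" string built with the TWO separator style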
ChatUniVi/demo.py ADDED
@@ -0,0 +1,160 @@
import torch
# 'spaces' provides the @spaces.GPU decorator applied to generate() below (Hugging Face Spaces runtime).
import spaces
from .constants import *
from .conversation import conv_templates, SeparatorStyle
from .model.builder import load_pretrained_model
from .utils import disable_torch_init
from .mm_utils import tokenizer_image_token, KeywordsStoppingCriteria
from PIL import Image
import os
from decord import VideoReader, cpu
import numpy as np


class Chat:
    def __init__(self, model_path, conv_mode="simple", load_8bit=False, load_4bit=False):
        disable_torch_init()
        self.tokenizer, self.model, self.image_processor, context_len = load_pretrained_model(model_path, None, model_name="ChatUniVi", load_8bit=load_8bit, load_4bit=load_4bit)
        self.model.to("cuda:0")
        mm_use_im_start_end = getattr(self.model.config, "mm_use_im_start_end", False)
        mm_use_im_patch_token = getattr(self.model.config, "mm_use_im_patch_token", True)
        if mm_use_im_patch_token:
            self.tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        if mm_use_im_start_end:
            self.tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
        self.model.resize_token_embeddings(len(self.tokenizer))

        vision_tower = self.model.get_vision_tower()
        if not vision_tower.is_loaded:
            vision_tower.load_model()

        self.image_processor = vision_tower.image_processor
        self.conv_mode = conv_mode
        print(self.model)

    def get_prompt(self, qs, state):
        state.append_message(state.roles[0], qs)
        state.append_message(state.roles[1], None)
        return state

    def _get_rawvideo_dec(self, video_path, image_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=224,
                          video_framerate=1, s=None, e=None):
        if s is None:
            start_time, end_time = None, None
        else:
            start_time = int(s)
            end_time = int(e)
            start_time = start_time if start_time >= 0. else 0.
            end_time = end_time if end_time >= 0. else 0.
            if start_time > end_time:
                start_time, end_time = end_time, start_time
            elif start_time == end_time:
                end_time = start_time + 1

        if os.path.exists(video_path):
            vreader = VideoReader(video_path, ctx=cpu(0))
        else:
            print(video_path)
            raise FileNotFoundError

        fps = vreader.get_avg_fps()
        f_start = 0 if start_time is None else int(start_time * fps)
        f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
        num_frames = f_end - f_start + 1
        if num_frames > 0:
            sample_fps = int(video_framerate)
            t_stride = int(round(float(fps) / sample_fps))

            all_pos = list(range(f_start, f_end + 1, t_stride))
            if len(all_pos) > max_frames:
                sample_pos = [all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
            else:
                sample_pos = all_pos

            patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
            return patch_images

    @torch.inference_mode()
    @spaces.GPU
    def generate(self, images_tensor: list, prompt: str, first_run: bool, state):
        tokenizer, model, image_processor = self.tokenizer, self.model, self.image_processor

        state = self.get_prompt(prompt, state)
        prompt = state.get_prompt()
        print(prompt)

        images_tensor = torch.stack(images_tensor, dim=0)
        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to("cuda:0")

        temperature = 0.2
        max_new_tokens = 1024

        stop_str = conv_templates[self.conv_mode].copy().sep if conv_templates[self.conv_mode].copy().sep_style != SeparatorStyle.TWO else \
            conv_templates[self.conv_mode].copy().sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=images_tensor,
                do_sample=True,
                temperature=temperature,
                num_beams=1,
                max_new_tokens=max_new_tokens,
                use_cache=True,
                stopping_criteria=[stopping_criteria])

        input_token_len = input_ids.shape[1]
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
        outputs = outputs.strip()
        if outputs.endswith(stop_str):
            outputs = outputs[:-len(stop_str)]
        outputs = outputs.strip()

        print('response', outputs)
        return outputs, state



title_markdown = ("""
<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
<a href="https://github.com/PKU-YuanGroup/Chat-UniVi" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
<img src="https://z1.ax1x.com/2023/11/22/pidlXh4.jpg" alt="Chat-UniVi🚀" style="max-width: 120px; height: auto;">
</a>
<div>
<h1 >Chat-UniVi: Unified Visual Representation Empowers Large Language Models with Image and Video Understanding</h1>
<h5 style="margin: 0;">If you like our project, please give us a star ✨ on Github for the latest update.</h5>
</div>
</div>
<div align="center">
<div style="display:flex; gap: 0.25rem;" align="center">
<a href='https://github.com/PKU-YuanGroup/Chat-UniVi'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
<a href="https://arxiv.org/pdf/2311.08046.pdf"><img src="https://img.shields.io/badge/Arxiv-2311.08046-red"></a>
<a href='https://github.com/PKU-YuanGroup/Chat-UniVi/stargazers'><img src='https://img.shields.io/github/stars/PKU-YuanGroup/Chat-UniVi.svg?style=social'></a>
</div>
</div>
""")

block_css = """
#buttons button {
    min-width: min(120px,100%);
}
"""


tos_markdown = ("""
### Terms of use
By using this service, users are required to agree to the following terms:
The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
""")


learn_more_markdown = ("""
### License
The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
""")
ChatUniVi/eval/evaluate/evaluate_benchmark_1_correctness.py ADDED
@@ -0,0 +1,199 @@
import openai
import os
import argparse
import json
import jsonlines
import ast
from multiprocessing.pool import Pool


def read_jsonl(file):
    results = []
    with open(file, encoding='utf-8') as f:
        for item in jsonlines.Reader(f):
            results.append(item)
    return results


def parse_args():
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
    parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
    parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
    parser.add_argument("--api_key", required=True, help="OpenAI API key.")
    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
    args = parser.parse_args()
    return args


def annotate(prediction_set, caption_files, output_dir):
    """
    Evaluates question and answer pairs using GPT-3
    Returns a score for correctness.
    """
    for file in caption_files:
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['pred']
        try:
            # Compute the correctness score
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content":
                            "You are an intelligent chatbot designed for evaluating the factual accuracy of generative outputs for video-based question-answer pairs. "
                            "Your task is to compare the predicted answer with the correct answer and determine if they are factually consistent. Here's how you can accomplish the task:"
                            "------"
                            "##INSTRUCTIONS: "
                            "- Focus on the factual consistency between the predicted answer and the correct answer. The predicted answer should not contain any misinterpretations or misinformation.\n"
                            "- The predicted answer must be factually accurate and align with the video content.\n"
                            "- Consider synonyms or paraphrases as valid matches.\n"
                            "- Evaluate the factual accuracy of the prediction compared to the answer."
                    },
                    {
                        "role": "user",
                        "content":
                            "Please evaluate the following video-based question-answer pair:\n\n"
                            f"Question: {question}\n"
                            f"Correct Answer: {answer}\n"
                            f"Predicted Answer: {pred}\n\n"
                            "Provide your evaluation only as a factual accuracy score where the factual accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of factual consistency. "
                            "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the factual accuracy score in INTEGER, not STRING."
                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                            "For example, your response should look like this: {''score': 4.8}."
                    }
                ]
            )
            # Convert response to a Python dictionary.
            response_message = completion["choices"][0]["message"]["content"]
            response_dict = ast.literal_eval(response_message)
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")


def main():
    """
    Main function to control the flow of the program.
    """
    # Parse arguments.
    args = parse_args()

    file = args.pred_path
    try:
        pred_contents = json.load(file)
    except:
        pred_contents = read_jsonl(file)

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Iterate through each sample in pred_contents
    for sample in pred_contents:
        sample['video_name'] = 1
        video_id = sample['video_name']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # Create a new sample with the modified key
        new_sample = sample
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of id's and corresponding files
    id_list = [x['video_name'] for x in new_pred_contents]
    caption_files = [f"{id}.json" for id in id_list]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Preparing dictionary of question-answer sets
    prediction_set = {}
    for sample in new_pred_contents:
        id = sample['video_name']
        question = sample['prompt']
        answer = sample['answer']
        pred = sample['text']
        qa_set = {"q": question, "a": answer, "pred": pred}
        prediction_set[id] = qa_set

    # Set the OpenAI API key.
    openai.api_key = args.api_key
    num_tasks = args.num_tasks

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have not been processed yet.
            completed_files = os.listdir(output_dir)
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    score_sum = 0
    count = 0
    for key, result in combined_contents.items():
        count += 1
        score_match = result[0]['score']
        score = int(score_match)
        score_sum += score
    average_score = score_sum / count

    print("Average score for correctness:", average_score * 20)


if __name__ == "__main__":
    main()
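For reference, a typical invocation of the script above using only the flags it defines in parse_args(); the paths and key are placeholders:

    python ChatUniVi/eval/evaluate/evaluate_benchmark_1_correctness.py \
        --pred_path results/correctness_pred.jsonl \
        --output_dir results/correctness_gpt \
        --output_json results/correctness_results.json \
        --api_key $OPENAI_API_KEY \
        --num_tasks 8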
ChatUniVi/eval/evaluate/evaluate_benchmark_1_correctness_mi.py ADDED
@@ -0,0 +1,201 @@
import openai
import os
import argparse
import json
import jsonlines
import ast
from multiprocessing.pool import Pool


def read_jsonl(file):
    results = []
    with open(file, encoding='utf-8') as f:
        for item in jsonlines.Reader(f):
            results.append(item)
    return results


def parse_args():
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
    parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
    parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
    parser.add_argument("--api_key", required=True, help="OpenAI API key.")
    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
    args = parser.parse_args()
    return args


def annotate(prediction_set, caption_files, output_dir):
    """
    Evaluates question and answer pairs using GPT-3
    Returns a score for correctness.
    """
    for file in caption_files:
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['pred']
        try:
            # Compute the correctness score
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content":
                            "You are an intelligent chatbot designed for evaluating the factual accuracy of generative outputs for video-based question-answer pairs. "
                            "Your task is to compare the predicted answer with the correct answer and determine if they are factually consistent. Here's how you can accomplish the task:"
                            "------"
                            "##INSTRUCTIONS: "
                            "- Focus on the factual consistency between the predicted answer and the correct answer. The predicted answer should not contain any misinterpretations or misinformation.\n"
                            "- The predicted answer must be factually accurate and align with the video content.\n"
                            "- Consider synonyms or paraphrases as valid matches.\n"
                            "- Evaluate the factual accuracy of the prediction compared to the answer."
                    },
                    {
                        "role": "user",
                        "content":
                            "Please evaluate the following video-based question-answer pair:\n\n"
                            f"Question: {question}\n"
                            f"Correct Answer: {answer}\n"
                            f"Predicted Answer: {pred}\n\n"
                            "Provide your evaluation only as a factual accuracy score where the factual accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of factual consistency. "
                            "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the factual accuracy score in INTEGER, not STRING."
                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                            "For example, your response should look like this: {''score': 4.8}."
                    }
                ]
            )
            # Convert response to a Python dictionary.
            response_message = completion["choices"][0]["message"]["content"]
            response_dict = ast.literal_eval(response_message)
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")


def main():
    """
    Main function to control the flow of the program.
    """
    # Parse arguments.
    args = parse_args()

    file = args.pred_path
    try:
        pred_contents = json.load(file)
    except:
        pred_contents = read_jsonl(file)

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Iterate through each sample in pred_contents
    for sample in pred_contents:
        sample['video_name'] = 1
        video_id = sample['video_name']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # Create a new sample with the modified key
        new_sample = sample
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of id's and corresponding files
    id_list = [x['video_name'] for x in new_pred_contents]
    caption_files = [f"{id}.json" for id in id_list]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Preparing dictionary of question-answer sets
    prediction_set = {}
    for sample in new_pred_contents:
        id = sample['video_name']
        question = sample['prompt']
        answer = sample['answer']
        pred = sample['text']
        qa_set = {"q": question, "a": answer, "pred": pred}
        prediction_set[id] = qa_set

    # Set the OpenAI API key.
    openai.api_key = args.api_key
    openai.api_base = 'http://10.234.32.86:8000/lumi-proxy'
    openai.api_key = "Don't Panic"
    num_tasks = args.num_tasks

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have not been processed yet.
            completed_files = os.listdir(output_dir)
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    score_sum = 0
    count = 0
    for key, result in combined_contents.items():
        count += 1
        score_match = result[0]['score']
        score = int(score_match)
        score_sum += score
    average_score = score_sum / count

    print("Average score for correctness:", average_score * 20)


if __name__ == "__main__":
    main()
ChatUniVi/eval/evaluate/evaluate_benchmark_2_detailed_orientation.py ADDED
@@ -0,0 +1,199 @@
import openai
import os
import argparse
import json
import jsonlines
import ast
from multiprocessing.pool import Pool


def read_jsonl(file):
    results = []
    with open(file, encoding='utf-8') as f:
        for item in jsonlines.Reader(f):
            results.append(item)
    return results


def parse_args():
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
    parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
    parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
    parser.add_argument("--api_key", required=True, help="OpenAI API key.")
    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
    args = parser.parse_args()
    return args


def annotate(prediction_set, caption_files, output_dir):
    """
    Evaluates question and answer pairs using GPT-3 and
    returns a score for detailed orientation.
    """
    for file in caption_files:
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['pred']
        try:
            # Compute the detailed-orientation score
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content":
                            "You are an intelligent chatbot designed for evaluating the detail orientation of generative outputs for video-based question-answer pairs. "
                            "Your task is to compare the predicted answer with the correct answer and determine its level of detail, considering both completeness and specificity. Here's how you can accomplish the task:"
                            "------"
                            "##INSTRUCTIONS: "
                            "- Check if the predicted answer covers all major points from the video. The response should not leave out any key aspects.\n"
                            "- Evaluate whether the predicted answer includes specific details rather than just generic points. It should provide comprehensive information that is tied to specific elements of the video.\n"
                            "- Consider synonyms or paraphrases as valid matches.\n"
                            "- Provide a single evaluation score that reflects the level of detail orientation of the prediction, considering both completeness and specificity."
                    },
                    {
                        "role": "user",
                        "content":
                            "Please evaluate the following video-based question-answer pair:\n\n"
                            f"Question: {question}\n"
                            f"Correct Answer: {answer}\n"
                            f"Predicted Answer: {pred}\n\n"
                            "Provide your evaluation only as a detail orientation score where the detail orientation score is an integer value between 0 and 5, with 5 indicating the highest level of detail orientation. "
                            "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the detail orientation score in INTEGER, not STRING."
                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                            "For example, your response should look like this: {''score': 4.8}."
                    }
                ]
            )
            # Convert response to a Python dictionary.
            response_message = completion["choices"][0]["message"]["content"]
            response_dict = ast.literal_eval(response_message)
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")


def main():
    """
    Main function to control the flow of the program.
    """
    # Parse arguments.
    args = parse_args()

    file = args.pred_path
    try:
        pred_contents = json.load(file)
    except:
        pred_contents = read_jsonl(file)

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Iterate through each sample in pred_contents
    for sample in pred_contents:
        sample['video_name'] = 1
        video_id = sample['video_name']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # Create a new sample with the modified key
        new_sample = sample
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of id's and corresponding files
    id_list = [x['video_name'] for x in new_pred_contents]
    caption_files = [f"{id}.json" for id in id_list]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Preparing dictionary of question-answer sets
    prediction_set = {}
    for sample in new_pred_contents:
        id = sample['video_name']
        question = sample['prompt']
        answer = sample['answer']
        pred = sample['text']
        qa_set = {"q": question, "a": answer, "pred": pred}
        prediction_set[id] = qa_set

    # Set the OpenAI API key.
    openai.api_key = args.api_key
    num_tasks = args.num_tasks

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have not been processed yet.
            completed_files = os.listdir(output_dir)
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    score_sum = 0
    count = 0
    for key, result in combined_contents.items():
        count += 1
        score_match = result[0]['score']
        score = int(score_match)
        score_sum += score
    average_score = score_sum / count

    print("Average score for detailed orientation:", average_score * 20)


if __name__ == "__main__":
    main()
ChatUniVi/eval/evaluate/evaluate_benchmark_2_detailed_orientation_mi.py ADDED
@@ -0,0 +1,201 @@
import openai
import os
import argparse
import json
import jsonlines
import ast
from multiprocessing.pool import Pool


def read_jsonl(file):
    results = []
    with open(file, encoding='utf-8') as f:
        for item in jsonlines.Reader(f):
            results.append(item)
    return results


def parse_args():
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
    parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
    parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
    parser.add_argument("--api_key", required=True, help="OpenAI API key.")
    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
    args = parser.parse_args()
    return args


def annotate(prediction_set, caption_files, output_dir):
    """
    Evaluates question and answer pairs using GPT-3 and
    returns a score for detailed orientation.
    """
    for file in caption_files:
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['pred']
        try:
            # Compute the detailed-orientation score
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content":
                            "You are an intelligent chatbot designed for evaluating the detail orientation of generative outputs for video-based question-answer pairs. "
                            "Your task is to compare the predicted answer with the correct answer and determine its level of detail, considering both completeness and specificity. Here's how you can accomplish the task:"
                            "------"
                            "##INSTRUCTIONS: "
                            "- Check if the predicted answer covers all major points from the video. The response should not leave out any key aspects.\n"
                            "- Evaluate whether the predicted answer includes specific details rather than just generic points. It should provide comprehensive information that is tied to specific elements of the video.\n"
                            "- Consider synonyms or paraphrases as valid matches.\n"
                            "- Provide a single evaluation score that reflects the level of detail orientation of the prediction, considering both completeness and specificity."
                    },
                    {
                        "role": "user",
                        "content":
                            "Please evaluate the following video-based question-answer pair:\n\n"
                            f"Question: {question}\n"
                            f"Correct Answer: {answer}\n"
                            f"Predicted Answer: {pred}\n\n"
                            "Provide your evaluation only as a detail orientation score where the detail orientation score is an integer value between 0 and 5, with 5 indicating the highest level of detail orientation. "
                            "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the detail orientation score in INTEGER, not STRING."
                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                            "For example, your response should look like this: {''score': 4.8}."
                    }
                ]
            )
            # Convert response to a Python dictionary.
            response_message = completion["choices"][0]["message"]["content"]
            response_dict = ast.literal_eval(response_message)
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")


def main():
    """
    Main function to control the flow of the program.
    """
    # Parse arguments.
    args = parse_args()

    file = args.pred_path
    try:
        pred_contents = json.load(file)
    except:
        pred_contents = read_jsonl(file)

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Iterate through each sample in pred_contents
    for sample in pred_contents:
        sample['video_name'] = 1
        video_id = sample['video_name']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # Create a new sample with the modified key
        new_sample = sample
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of id's and corresponding files
    id_list = [x['video_name'] for x in new_pred_contents]
    caption_files = [f"{id}.json" for id in id_list]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Preparing dictionary of question-answer sets
    prediction_set = {}
    for sample in new_pred_contents:
        id = sample['video_name']
        question = sample['prompt']
        answer = sample['answer']
        pred = sample['text']
        qa_set = {"q": question, "a": answer, "pred": pred}
        prediction_set[id] = qa_set

    # Set the OpenAI API key.
    openai.api_key = args.api_key
    openai.api_base = 'http://10.234.32.86:8000/lumi-proxy'
    openai.api_key = "Don't Panic"
    num_tasks = args.num_tasks

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have not been processed yet.
            completed_files = os.listdir(output_dir)
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    score_sum = 0
    count = 0
    for key, result in combined_contents.items():
        count += 1
        score_match = result[0]['score']
        score = int(score_match)
        score_sum += score
    average_score = score_sum / count

    print("Average score for detailed orientation:", average_score * 20)


if __name__ == "__main__":
    main()
ChatUniVi/eval/evaluate/evaluate_benchmark_3_context.py ADDED
@@ -0,0 +1,199 @@
1
+ import openai
2
+ import os
3
+ import argparse
4
+ import json
5
+ import jsonlines
6
+ import ast
7
+ from multiprocessing.pool import Pool
8
+
9
+
10
+ def read_jsonl(file):
11
+ results = []
12
+ with open(file, encoding='utf-8') as f:
13
+ for item in jsonlines.Reader(f):
14
+ results.append(item)
15
+ return results
16
+
17
+
18
+ def parse_args():
19
+ parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
20
+ parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
21
+ parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
22
+ parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
23
+ parser.add_argument("--api_key", required=True, help="OpenAI API key.")
24
+ parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
25
+ args = parser.parse_args()
26
+ return args
27
+
28
+
29
+ def annotate(prediction_set, caption_files, output_dir):
30
+ """
31
+ Evaluates question and answer pairs using GPT-3 and
32
+ returns a score for contextual understanding.
33
+ """
34
+ for file in caption_files:
35
+ key = file[:-5] # Strip file extension
36
+ qa_set = prediction_set[key]
37
+ question = qa_set['q']
38
+ answer = qa_set['a']
39
+ pred = qa_set['pred']
40
+ try:
41
+ # Compute the contextual understanding score
42
+ completion = openai.ChatCompletion.create(
43
+ model="gpt-3.5-turbo",
44
+ messages=[
45
+ {
46
+ "role": "system",
47
+ "content":
48
+ "You are an intelligent chatbot designed for evaluating the contextual understanding of generative outputs for video-based question-answer pairs. "
49
+ "Your task is to compare the predicted answer with the correct answer and determine if the generated response aligns with the overall context of the video content. Here's how you can accomplish the task:"
50
+ "------"
51
+ "##INSTRUCTIONS: "
52
+ "- Evaluate whether the predicted answer aligns with the overall context of the video content. It should not provide information that is out of context or misaligned.\n"
53
+ "- The predicted answer must capture the main themes and sentiments of the video.\n"
54
+ "- Consider synonyms or paraphrases as valid matches.\n"
55
+ "- Provide your evaluation of the contextual understanding of the prediction compared to the answer."
56
+ },
57
+ {
58
+ "role": "user",
59
+ "content":
60
+ "Please evaluate the following video-based question-answer pair:\n\n"
61
+ f"Question: {question}\n"
62
+ f"Correct Answer: {answer}\n"
63
+ f"Predicted Answer: {pred}\n\n"
64
+ "Provide your evaluation only as a contextual understanding score where the contextual understanding score is an integer value between 0 and 5, with 5 indicating the highest level of contextual understanding. "
65
+ "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is contextual understanding score in INTEGER, not STRING."
66
+ "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
67
+ "For example, your response should look like this: {''score': 4.8}."
68
+ }
69
+ ]
70
+ )
71
+ # Convert response to a Python dictionary.
72
+ response_message = completion["choices"][0]["message"]["content"]
73
+ response_dict = ast.literal_eval(response_message)
74
+ result_qa_pair = [response_dict, qa_set]
75
+
76
+ # Save the question-answer pairs to a json file.
77
+ with open(f"{output_dir}/{key}.json", "w") as f:
78
+ json.dump(result_qa_pair, f)
79
+
80
+ except Exception as e:
81
+ print(f"Error processing file '{key}': {e}")
82
+
83
+
84
+ def main():
85
+ """
86
+ Main function to control the flow of the program.
87
+ """
88
+ # Parse arguments.
89
+ args = parse_args()
90
+
91
+ file = args.pred_path
92
+ try:
93
+ pred_contents = json.load(open(file))
94
+ except Exception:
95
+ pred_contents = read_jsonl(file)
96
+
97
+ # Dictionary to store the count of occurrences for each video_id
98
+ video_id_counts = {}
99
+ new_pred_contents = []
100
+
101
+ # Iterate through each sample in pred_contents
102
+ for sample in pred_contents:
103
+ sample['video_name'] = 1
104
+ video_id = sample['video_name']
105
+ if video_id in video_id_counts:
106
+ video_id_counts[video_id] += 1
107
+ else:
108
+ video_id_counts[video_id] = 0
109
+
110
+ # Create a new sample with the modified key
111
+ new_sample = sample
112
+ new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
113
+ new_pred_contents.append(new_sample)
114
+
115
+ # Generating list of id's and corresponding files
116
+ id_list = [x['video_name'] for x in new_pred_contents]
117
+ caption_files = [f"{id}.json" for id in id_list]
118
+
119
+ output_dir = args.output_dir
120
+ # Generate output directory if not exists.
121
+ if not os.path.exists(output_dir):
122
+ os.makedirs(output_dir)
123
+
124
+ # Preparing dictionary of question-answer sets
125
+ prediction_set = {}
126
+ for sample in new_pred_contents:
127
+ id = sample['video_name']
128
+ question = sample['prompt']
129
+ answer = sample['answer']
130
+ pred = sample['text']
131
+ qa_set = {"q": question, "a": answer, "pred": pred}
132
+ prediction_set[id] = qa_set
133
+
134
+ # Set the OpenAI API key.
135
+ openai.api_key = args.api_key
136
+ num_tasks = args.num_tasks
137
+
138
+ # While loop to ensure that all captions are processed.
139
+ while True:
140
+ try:
141
+ # Files that have already been processed.
142
+ completed_files = os.listdir(output_dir)
143
+ print(f"completed_files: {len(completed_files)}")
144
+
145
+ # Files that have not been processed yet.
146
+ incomplete_files = [f for f in caption_files if f not in completed_files]
147
+ print(f"incomplete_files: {len(incomplete_files)}")
148
+
149
+ # Break the loop when there are no incomplete files
150
+ if len(incomplete_files) == 0:
151
+ break
152
+ if len(incomplete_files) <= num_tasks:
153
+ num_tasks = 1
154
+
155
+ # Split tasks into parts.
156
+ part_len = len(incomplete_files) // num_tasks
157
+ all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
158
+ task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
159
+
160
+ # Use a pool of workers to process the files in parallel.
161
+ with Pool() as pool:
162
+ pool.starmap(annotate, task_args)
163
+
164
+ except Exception as e:
165
+ print(f"Error: {e}")
166
+
167
+ # Combine all the processed files into one
168
+ combined_contents = {}
169
+ json_path = args.output_json
170
+
171
+ # Iterate through json files
172
+ for file_name in os.listdir(output_dir):
173
+ if file_name.endswith(".json"):
174
+ file_path = os.path.join(output_dir, file_name)
175
+ with open(file_path, "r") as json_file:
176
+ content = json.load(json_file)
177
+ combined_contents[file_name[:-5]] = content
178
+
179
+ # Write combined content to a json file
180
+ with open(json_path, "w") as json_file:
181
+ json.dump(combined_contents, json_file)
182
+ print("All evaluation completed!")
183
+
184
+ # Calculate average score
185
+ score_sum = 0
186
+ count = 0
187
+ for key, result in combined_contents.items():
188
+ count += 1
189
+ score_match = result[0]['score']
190
+ score = int(score_match)
191
+ score_sum += score
192
+ average_score = score_sum / count
193
+
194
+ print("Average score for contextual understanding:", average_score * 20)
195
+
196
+
197
+ if __name__ == "__main__":
198
+ main()
199
+
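Note: judging from the field accesses in main() above, each record in the --pred_path file is expected to carry the question, the reference answer, and the model prediction. A minimal sketch of one record (key names come from the script; the values are placeholders):

    sample = {
        "prompt": "What is the person doing at the start of the video?",  # question shown to the judge
        "answer": "They are tying their shoes.",                          # reference answer
        "text": "The person ties their shoelaces before running.",        # model prediction being scored
    }

The script overwrites 'video_name' itself, so the input records do not need to provide it.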
ChatUniVi/eval/evaluate/evaluate_benchmark_3_context_mi.py ADDED
@@ -0,0 +1,201 @@
 
 
1
+ import openai
2
+ import os
3
+ import argparse
4
+ import json
5
+ import jsonlines
6
+ import ast
7
+ from multiprocessing.pool import Pool
8
+
9
+
10
+ def read_jsonl(file):
11
+ results = []
12
+ with open(file, encoding='utf-8') as f:
13
+ for item in jsonlines.Reader(f):
14
+ results.append(item)
15
+ return results
16
+
17
+
18
+ def parse_args():
19
+ parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
20
+ parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
21
+ parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
22
+ parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
23
+ parser.add_argument("--api_key", required=True, help="OpenAI API key.")
24
+ parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
25
+ args = parser.parse_args()
26
+ return args
27
+
28
+
29
+ def annotate(prediction_set, caption_files, output_dir):
30
+ """
31
+ Evaluates question and answer pairs using GPT-3 and
32
+ returns a score for contextual understanding.
33
+ """
34
+ for file in caption_files:
35
+ key = file[:-5] # Strip file extension
36
+ qa_set = prediction_set[key]
37
+ question = qa_set['q']
38
+ answer = qa_set['a']
39
+ pred = qa_set['pred']
40
+ try:
41
+ # Compute the contextual understanding score
42
+ completion = openai.ChatCompletion.create(
43
+ model="gpt-3.5-turbo",
44
+ messages=[
45
+ {
46
+ "role": "system",
47
+ "content":
48
+ "You are an intelligent chatbot designed for evaluating the contextual understanding of generative outputs for video-based question-answer pairs. "
49
+ "Your task is to compare the predicted answer with the correct answer and determine if the generated response aligns with the overall context of the video content. Here's how you can accomplish the task:"
50
+ "------"
51
+ "##INSTRUCTIONS: "
52
+ "- Evaluate whether the predicted answer aligns with the overall context of the video content. It should not provide information that is out of context or misaligned.\n"
53
+ "- The predicted answer must capture the main themes and sentiments of the video.\n"
54
+ "- Consider synonyms or paraphrases as valid matches.\n"
55
+ "- Provide your evaluation of the contextual understanding of the prediction compared to the answer."
56
+ },
57
+ {
58
+ "role": "user",
59
+ "content":
60
+ "Please evaluate the following video-based question-answer pair:\n\n"
61
+ f"Question: {question}\n"
62
+ f"Correct Answer: {answer}\n"
63
+ f"Predicted Answer: {pred}\n\n"
64
+ "Provide your evaluation only as a contextual understanding score where the contextual understanding score is an integer value between 0 and 5, with 5 indicating the highest level of contextual understanding. "
65
+ "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is contextual understanding score in INTEGER, not STRING."
66
+ "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
67
+ "For example, your response should look like this: {''score': 4.8}."
68
+ }
69
+ ]
70
+ )
71
+ # Convert response to a Python dictionary.
72
+ response_message = completion["choices"][0]["message"]["content"]
73
+ response_dict = ast.literal_eval(response_message)
74
+ result_qa_pair = [response_dict, qa_set]
75
+
76
+ # Save the question-answer pairs to a json file.
77
+ with open(f"{output_dir}/{key}.json", "w") as f:
78
+ json.dump(result_qa_pair, f)
79
+
80
+ except Exception as e:
81
+ print(f"Error processing file '{key}': {e}")
82
+
83
+
84
+ def main():
85
+ """
86
+ Main function to control the flow of the program.
87
+ """
88
+ # Parse arguments.
89
+ args = parse_args()
90
+
91
+ file = args.pred_path
92
+ try:
93
+ pred_contents = json.load(open(file))
94
+ except Exception:
95
+ pred_contents = read_jsonl(file)
96
+
97
+ # Dictionary to store the count of occurrences for each video_id
98
+ video_id_counts = {}
99
+ new_pred_contents = []
100
+
101
+ # Iterate through each sample in pred_contents
102
+ for sample in pred_contents:
103
+ sample['video_name'] = 1
104
+ video_id = sample['video_name']
105
+ if video_id in video_id_counts:
106
+ video_id_counts[video_id] += 1
107
+ else:
108
+ video_id_counts[video_id] = 0
109
+
110
+ # Create a new sample with the modified key
111
+ new_sample = sample
112
+ new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
113
+ new_pred_contents.append(new_sample)
114
+
115
+ # Generating list of id's and corresponding files
116
+ id_list = [x['video_name'] for x in new_pred_contents]
117
+ caption_files = [f"{id}.json" for id in id_list]
118
+
119
+ output_dir = args.output_dir
120
+ # Generate output directory if not exists.
121
+ if not os.path.exists(output_dir):
122
+ os.makedirs(output_dir)
123
+
124
+ # Preparing dictionary of question-answer sets
125
+ prediction_set = {}
126
+ for sample in new_pred_contents:
127
+ id = sample['video_name']
128
+ question = sample['prompt']
129
+ answer = sample['answer']
130
+ pred = sample['text']
131
+ qa_set = {"q": question, "a": answer, "pred": pred}
132
+ prediction_set[id] = qa_set
133
+
134
+ # Set the OpenAI API key.
135
+ openai.api_key = args.api_key
136
+ openai.api_base = 'http://10.234.32.86:8000/lumi-proxy'
137
+ openai.api_key = "Don't Panic"
138
+ num_tasks = args.num_tasks
139
+
140
+ # While loop to ensure that all captions are processed.
141
+ while True:
142
+ try:
143
+ # Files that have already been processed.
144
+ completed_files = os.listdir(output_dir)
145
+ print(f"completed_files: {len(completed_files)}")
146
+
147
+ # Files that have not been processed yet.
148
+ incomplete_files = [f for f in caption_files if f not in completed_files]
149
+ print(f"incomplete_files: {len(incomplete_files)}")
150
+
151
+ # Break the loop when there are no incomplete files
152
+ if len(incomplete_files) == 0:
153
+ break
154
+ if len(incomplete_files) <= num_tasks:
155
+ num_tasks = 1
156
+
157
+ # Split tasks into parts.
158
+ part_len = len(incomplete_files) // num_tasks
159
+ all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
160
+ task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
161
+
162
+ # Use a pool of workers to process the files in parallel.
163
+ with Pool() as pool:
164
+ pool.starmap(annotate, task_args)
165
+
166
+ except Exception as e:
167
+ print(f"Error: {e}")
168
+
169
+ # Combine all the processed files into one
170
+ combined_contents = {}
171
+ json_path = args.output_json
172
+
173
+ # Iterate through json files
174
+ for file_name in os.listdir(output_dir):
175
+ if file_name.endswith(".json"):
176
+ file_path = os.path.join(output_dir, file_name)
177
+ with open(file_path, "r") as json_file:
178
+ content = json.load(json_file)
179
+ combined_contents[file_name[:-5]] = content
180
+
181
+ # Write combined content to a json file
182
+ with open(json_path, "w") as json_file:
183
+ json.dump(combined_contents, json_file)
184
+ print("All evaluation completed!")
185
+
186
+ # Calculate average score
187
+ score_sum = 0
188
+ count = 0
189
+ for key, result in combined_contents.items():
190
+ count += 1
191
+ score_match = result[0]['score']
192
+ score = int(score_match)
193
+ score_sum += score
194
+ average_score = score_sum / count
195
+
196
+ print("Average score for contextual understanding:", average_score * 20)
197
+
198
+
199
+ if __name__ == "__main__":
200
+ main()
201
+
ChatUniVi/eval/evaluate/evaluate_benchmark_4_temporal.py ADDED
@@ -0,0 +1,198 @@
 
 
1
+ import openai
2
+ import os
3
+ import argparse
4
+ import json
5
+ import jsonlines
6
+ import ast
7
+ from multiprocessing.pool import Pool
8
+
9
+
10
+ def read_jsonl(file):
11
+ results = []
12
+ with open(file, encoding='utf-8') as f:
13
+ for item in jsonlines.Reader(f):
14
+ results.append(item)
15
+ return results
16
+
17
+
18
+ def parse_args():
19
+ parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
20
+ parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
21
+ parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
22
+ parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
23
+ parser.add_argument("--api_key", required=True, help="OpenAI API key.")
24
+ parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
25
+ args = parser.parse_args()
26
+ return args
27
+
28
+
29
+ def annotate(prediction_set, caption_files, output_dir):
30
+ """
31
+ Evaluates question and answer pairs using GPT-3 and
32
+ returns a score for temporal understanding.
33
+ """
34
+ for file in caption_files:
35
+ key = file[:-5] # Strip file extension
36
+ qa_set = prediction_set[key]
37
+ question = qa_set['q']
38
+ answer = qa_set['a']
39
+ pred = qa_set['pred']
40
+ try:
41
+ # Compute the temporal understanding score
42
+ completion = openai.ChatCompletion.create(
43
+ model="gpt-3.5-turbo",
44
+ messages=[
45
+ {
46
+ "role": "system",
47
+ "content":
48
+ "You are an intelligent chatbot designed for evaluating the temporal understanding of generative outputs for video-based question-answer pairs. "
49
+ "Your task is to compare the predicted answer with the correct answer and determine if they correctly reflect the temporal sequence of events in the video content. Here's how you can accomplish the task:"
50
+ "------"
51
+ "##INSTRUCTIONS: "
52
+ "- Focus on the temporal consistency between the predicted answer and the correct answer. The predicted answer should correctly reflect the sequence of events or details as they are presented in the video content.\n"
53
+ "- Consider synonyms or paraphrases as valid matches, but only if the temporal order is maintained.\n"
54
+ "- Evaluate the temporal accuracy of the prediction compared to the answer."
55
+ },
56
+ {
57
+ "role": "user",
58
+ "content":
59
+ "Please evaluate the following video-based question-answer pair:\n\n"
60
+ f"Question: {question}\n"
61
+ f"Correct Answer: {answer}\n"
62
+ f"Predicted Answer: {pred}\n\n"
63
+ "Provide your evaluation only as a temporal accuracy score where the temporal accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of temporal consistency. "
64
+ "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the temporal accuracy score in INTEGER, not STRING."
65
+ "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
66
+ "For example, your response should look like this: {''score': 4.8}."
67
+ }
68
+ ]
69
+ )
70
+ # Convert response to a Python dictionary.
71
+ response_message = completion["choices"][0]["message"]["content"]
72
+ response_dict = ast.literal_eval(response_message)
73
+ result_qa_pair = [response_dict, qa_set]
74
+
75
+ # Save the question-answer pairs to a json file.
76
+ with open(f"{output_dir}/{key}.json", "w") as f:
77
+ json.dump(result_qa_pair, f)
78
+
79
+ except Exception as e:
80
+ print(f"Error processing file '{key}': {e}")
81
+
82
+
83
+ def main():
84
+ """
85
+ Main function to control the flow of the program.
86
+ """
87
+ # Parse arguments.
88
+ args = parse_args()
89
+
90
+ file = args.pred_path
91
+ try:
92
+ pred_contents = json.load(open(file))
93
+ except Exception:
94
+ pred_contents = read_jsonl(file)
95
+
96
+ # Dictionary to store the count of occurrences for each video_id
97
+ video_id_counts = {}
98
+ new_pred_contents = []
99
+
100
+ # Iterate through each sample in pred_contents
101
+ for sample in pred_contents:
102
+ sample['video_name'] = 1
103
+ video_id = sample['video_name']
104
+ if video_id in video_id_counts:
105
+ video_id_counts[video_id] += 1
106
+ else:
107
+ video_id_counts[video_id] = 0
108
+
109
+ # Create a new sample with the modified key
110
+ new_sample = sample
111
+ new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
112
+ new_pred_contents.append(new_sample)
113
+
114
+ # Generating list of id's and corresponding files
115
+ id_list = [x['video_name'] for x in new_pred_contents]
116
+ caption_files = [f"{id}.json" for id in id_list]
117
+
118
+ output_dir = args.output_dir
119
+ # Generate output directory if not exists.
120
+ if not os.path.exists(output_dir):
121
+ os.makedirs(output_dir)
122
+
123
+ # Preparing dictionary of question-answer sets
124
+ prediction_set = {}
125
+ for sample in new_pred_contents:
126
+ id = sample['video_name']
127
+ question = sample['prompt']
128
+ answer = sample['answer']
129
+ pred = sample['text']
130
+ qa_set = {"q": question, "a": answer, "pred": pred}
131
+ prediction_set[id] = qa_set
132
+
133
+ # Set the OpenAI API key.
134
+ openai.api_key = args.api_key
135
+ num_tasks = args.num_tasks
136
+
137
+ # While loop to ensure that all captions are processed.
138
+ while True:
139
+ try:
140
+ # Files that have already been processed.
141
+ completed_files = os.listdir(output_dir)
142
+ print(f"completed_files: {len(completed_files)}")
143
+
144
+ # Files that have not been processed yet.
145
+ incomplete_files = [f for f in caption_files if f not in completed_files]
146
+ print(f"incomplete_files: {len(incomplete_files)}")
147
+
148
+ # Break the loop when there are no incomplete files
149
+ if len(incomplete_files) == 0:
150
+ break
151
+ if len(incomplete_files) <= num_tasks:
152
+ num_tasks = 1
153
+
154
+ # Split tasks into parts.
155
+ part_len = len(incomplete_files) // num_tasks
156
+ all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
157
+ task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
158
+
159
+ # Use a pool of workers to process the files in parallel.
160
+ with Pool() as pool:
161
+ pool.starmap(annotate, task_args)
162
+
163
+ except Exception as e:
164
+ print(f"Error: {e}")
165
+
166
+ # Combine all the processed files into one
167
+ combined_contents = {}
168
+ json_path = args.output_json
169
+
170
+ # Iterate through json files
171
+ for file_name in os.listdir(output_dir):
172
+ if file_name.endswith(".json"):
173
+ file_path = os.path.join(output_dir, file_name)
174
+ with open(file_path, "r") as json_file:
175
+ content = json.load(json_file)
176
+ combined_contents[file_name[:-5]] = content
177
+
178
+ # Write combined content to a json file
179
+ with open(json_path, "w") as json_file:
180
+ json.dump(combined_contents, json_file)
181
+ print("All evaluation completed!")
182
+
183
+ # Calculate average score
184
+ score_sum = 0
185
+ count = 0
186
+ for key, result in combined_contents.items():
187
+ count += 1
188
+ score_match = result[0]['score']
189
+ score = int(score_match)
190
+ score_sum += score
191
+ average_score = score_sum / count
192
+
193
+ print("Average score temporal understanding:", average_score * 20)
194
+
195
+
196
+ if __name__ == "__main__":
197
+ main()
198
+
ChatUniVi/eval/evaluate/evaluate_benchmark_4_temporal_mi.py ADDED
@@ -0,0 +1,200 @@
 
 
1
+ import openai
2
+ import os
3
+ import argparse
4
+ import json
5
+ import jsonlines
6
+ import ast
7
+ from multiprocessing.pool import Pool
8
+
9
+
10
+ def read_jsonl(file):
11
+ results = []
12
+ with open(file, encoding='utf-8') as f:
13
+ for item in jsonlines.Reader(f):
14
+ results.append(item)
15
+ return results
16
+
17
+
18
+ def parse_args():
19
+ parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
20
+ parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
21
+ parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
22
+ parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
23
+ parser.add_argument("--api_key", required=True, help="OpenAI API key.")
24
+ parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
25
+ args = parser.parse_args()
26
+ return args
27
+
28
+
29
+ def annotate(prediction_set, caption_files, output_dir):
30
+ """
31
+ Evaluates question and answer pairs using GPT-3 and
32
+ returns a score for temporal understanding.
33
+ """
34
+ for file in caption_files:
35
+ key = file[:-5] # Strip file extension
36
+ qa_set = prediction_set[key]
37
+ question = qa_set['q']
38
+ answer = qa_set['a']
39
+ pred = qa_set['pred']
40
+ try:
41
+ # Compute the temporal understanding score
42
+ completion = openai.ChatCompletion.create(
43
+ model="gpt-3.5-turbo",
44
+ messages=[
45
+ {
46
+ "role": "system",
47
+ "content":
48
+ "You are an intelligent chatbot designed for evaluating the temporal understanding of generative outputs for video-based question-answer pairs. "
49
+ "Your task is to compare the predicted answer with the correct answer and determine if they correctly reflect the temporal sequence of events in the video content. Here's how you can accomplish the task:"
50
+ "------"
51
+ "##INSTRUCTIONS: "
52
+ "- Focus on the temporal consistency between the predicted answer and the correct answer. The predicted answer should correctly reflect the sequence of events or details as they are presented in the video content.\n"
53
+ "- Consider synonyms or paraphrases as valid matches, but only if the temporal order is maintained.\n"
54
+ "- Evaluate the temporal accuracy of the prediction compared to the answer."
55
+ },
56
+ {
57
+ "role": "user",
58
+ "content":
59
+ "Please evaluate the following video-based question-answer pair:\n\n"
60
+ f"Question: {question}\n"
61
+ f"Correct Answer: {answer}\n"
62
+ f"Predicted Answer: {pred}\n\n"
63
+ "Provide your evaluation only as a temporal accuracy score where the temporal accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of temporal consistency. "
64
+ "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the temporal accuracy score in INTEGER, not STRING."
65
+ "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
66
+ "For example, your response should look like this: {''score': 4.8}."
67
+ }
68
+ ]
69
+ )
70
+ # Convert response to a Python dictionary.
71
+ response_message = completion["choices"][0]["message"]["content"]
72
+ response_dict = ast.literal_eval(response_message)
73
+ result_qa_pair = [response_dict, qa_set]
74
+
75
+ # Save the question-answer pairs to a json file.
76
+ with open(f"{output_dir}/{key}.json", "w") as f:
77
+ json.dump(result_qa_pair, f)
78
+
79
+ except Exception as e:
80
+ print(f"Error processing file '{key}': {e}")
81
+
82
+
83
+ def main():
84
+ """
85
+ Main function to control the flow of the program.
86
+ """
87
+ # Parse arguments.
88
+ args = parse_args()
89
+
90
+ file = args.pred_path
91
+ try:
92
+ pred_contents = json.load(open(file))
93
+ except Exception:
94
+ pred_contents = read_jsonl(file)
95
+
96
+ # Dictionary to store the count of occurrences for each video_id
97
+ video_id_counts = {}
98
+ new_pred_contents = []
99
+
100
+ # Iterate through each sample in pred_contents
101
+ for sample in pred_contents:
102
+ sample['video_name'] = 1
103
+ video_id = sample['video_name']
104
+ if video_id in video_id_counts:
105
+ video_id_counts[video_id] += 1
106
+ else:
107
+ video_id_counts[video_id] = 0
108
+
109
+ # Create a new sample with the modified key
110
+ new_sample = sample
111
+ new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
112
+ new_pred_contents.append(new_sample)
113
+
114
+ # Generating list of id's and corresponding files
115
+ id_list = [x['video_name'] for x in new_pred_contents]
116
+ caption_files = [f"{id}.json" for id in id_list]
117
+
118
+ output_dir = args.output_dir
119
+ # Generate output directory if not exists.
120
+ if not os.path.exists(output_dir):
121
+ os.makedirs(output_dir)
122
+
123
+ # Preparing dictionary of question-answer sets
124
+ prediction_set = {}
125
+ for sample in new_pred_contents:
126
+ id = sample['video_name']
127
+ question = sample['prompt']
128
+ answer = sample['answer']
129
+ pred = sample['text']
130
+ qa_set = {"q": question, "a": answer, "pred": pred}
131
+ prediction_set[id] = qa_set
132
+
133
+ # Set the OpenAI API key.
134
+ openai.api_key = args.api_key
135
+ openai.api_base = 'http://10.234.32.86:8000/lumi-proxy'
136
+ openai.api_key = "Don't Panic"
137
+ num_tasks = args.num_tasks
138
+
139
+ # While loop to ensure that all captions are processed.
140
+ while True:
141
+ try:
142
+ # Files that have already been processed.
143
+ completed_files = os.listdir(output_dir)
144
+ print(f"completed_files: {len(completed_files)}")
145
+
146
+ # Files that have not been processed yet.
147
+ incomplete_files = [f for f in caption_files if f not in completed_files]
148
+ print(f"incomplete_files: {len(incomplete_files)}")
149
+
150
+ # Break the loop when there are no incomplete files
151
+ if len(incomplete_files) == 0:
152
+ break
153
+ if len(incomplete_files) <= num_tasks:
154
+ num_tasks = 1
155
+
156
+ # Split tasks into parts.
157
+ part_len = len(incomplete_files) // num_tasks
158
+ all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
159
+ task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
160
+
161
+ # Use a pool of workers to process the files in parallel.
162
+ with Pool() as pool:
163
+ pool.starmap(annotate, task_args)
164
+
165
+ except Exception as e:
166
+ print(f"Error: {e}")
167
+
168
+ # Combine all the processed files into one
169
+ combined_contents = {}
170
+ json_path = args.output_json
171
+
172
+ # Iterate through json files
173
+ for file_name in os.listdir(output_dir):
174
+ if file_name.endswith(".json"):
175
+ file_path = os.path.join(output_dir, file_name)
176
+ with open(file_path, "r") as json_file:
177
+ content = json.load(json_file)
178
+ combined_contents[file_name[:-5]] = content
179
+
180
+ # Write combined content to a json file
181
+ with open(json_path, "w") as json_file:
182
+ json.dump(combined_contents, json_file)
183
+ print("All evaluation completed!")
184
+
185
+ # Calculate average score
186
+ score_sum = 0
187
+ count = 0
188
+ for key, result in combined_contents.items():
189
+ count += 1
190
+ score_match = result[0]['score']
191
+ score = int(score_match)
192
+ score_sum += score
193
+ average_score = score_sum / count
194
+
195
+ print("Average score temporal understanding:", average_score * 20)
196
+
197
+
198
+ if __name__ == "__main__":
199
+ main()
200
+
ChatUniVi/eval/evaluate/evaluate_benchmark_5_consistency.py ADDED
@@ -0,0 +1,207 @@
 
 
1
+ import openai
2
+ import os
3
+ import argparse
4
+ import json
5
+ import jsonlines
6
+ import ast
7
+ from multiprocessing.pool import Pool
8
+
9
+
10
+ def read_jsonl(file):
11
+ results = []
12
+ with open(file, encoding='utf-8') as f:
13
+ for item in jsonlines.Reader(f):
14
+ results.append(item)
15
+ return results
16
+
17
+
18
+ def parse_args():
19
+ parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
20
+ parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
21
+ parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
22
+ parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
23
+ parser.add_argument("--api_key", required=True, help="OpenAI API key.")
24
+ parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
25
+ args = parser.parse_args()
26
+ return args
27
+
28
+
29
+ def annotate(prediction_set, caption_files, output_dir):
30
+ """
31
+ Evaluates question and answer pairs using GPT-3 and
32
+ returns a score for consistency.
33
+ """
34
+ for file in caption_files:
35
+ key = file[:-5] # Strip file extension
36
+ qa_set = prediction_set[key]
37
+ question1 = qa_set['q1']
38
+ question2 = qa_set['q2']
39
+ answer = qa_set['a']
40
+ pred1 = qa_set['pred1']
41
+ pred2 = qa_set['pred2']
42
+ try:
43
+ # Compute the consistency score
44
+ completion = openai.ChatCompletion.create(
45
+ model="gpt-3.5-turbo",
46
+ messages=[
47
+ {
48
+ "role": "system",
49
+ "content":
50
+ "You are an intelligent chatbot designed for evaluating the consistency of generative outputs for similar video-based question-answer pairs. "
51
+ "You will be given two very similar questions, a common answer common to both the questions and predicted answers for the two questions ."
52
+ "Your task is to compare the predicted answers for two very similar question, with a common correct answer and determine if they are consistent. Here's how you can accomplish the task:"
53
+ "------"
54
+ "##INSTRUCTIONS: "
55
+ "- Focus on the consistency between the two predicted answers and the correct answer. Both predicted answers should correspond to the correct answer and to each other, and should not contain any contradictions or significant differences in the conveyed information.\n"
56
+ "- Both predicted answers must be consistent with each other and the correct answer, in terms of the information they provide about the video content.\n"
57
+ "- Consider synonyms or paraphrases as valid matches, but only if they maintain the consistency in the conveyed information.\n"
58
+ "- Evaluate the consistency of the two predicted answers compared to the correct answer."
59
+ },
60
+ {
61
+ "role": "user",
62
+ "content":
63
+ "Please evaluate the following video-based question-answer pair:\n\n"
64
+ f"Question 1: {question1}\n"
65
+ f"Question 2: {question2}\n"
66
+ f"Correct Answer: {answer}\n"
67
+ f"Predicted Answer to Question 1: {pred1}\n"
68
+ f"Predicted Answer to Question 2: {pred2}\n\n"
69
+ "Provide your evaluation only as a consistency score where the consistency score is an integer value between 0 and 5, with 5 indicating the highest level of consistency. "
70
+ "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the consistency score in INTEGER, not STRING."
71
+ "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
72
+ "For example, your response should look like this: {''score': 4.8}."
73
+ }
74
+ ],
75
+ )
76
+ # Convert response to a Python dictionary.
77
+ response_message = completion["choices"][0]["message"]["content"]
78
+ response_dict = ast.literal_eval(response_message)
79
+ result_qa_pair = [response_dict, qa_set]
80
+
81
+ # Save the question-answer pairs to a json file.
82
+ with open(f"{output_dir}/{key}.json", "w") as f:
83
+ json.dump(result_qa_pair, f)
84
+
85
+ except Exception as e:
86
+ print(f"Error processing file '{key}': {e}")
87
+
88
+
89
+ def main():
90
+ """
91
+ Main function to control the flow of the program.
92
+ """
93
+ # Parse arguments.
94
+ args = parse_args()
95
+
96
+ file = args.pred_path
97
+ try:
98
+ pred_contents = json.load(open(file))
99
+ except Exception:
100
+ pred_contents = read_jsonl(file)
101
+
102
+ # Dictionary to store the count of occurrences for each video_id
103
+ video_id_counts = {}
104
+ new_pred_contents = []
105
+
106
+ # Iterate through each sample in pred_contents
107
+ for sample in pred_contents:
108
+ # video_id = sample['video_name']
109
+ video_id = 1
110
+ if video_id in video_id_counts:
111
+ video_id_counts[video_id] += 1
112
+ else:
113
+ video_id_counts[video_id] = 0
114
+
115
+ # Create a new sample with the modified key
116
+ new_sample = sample
117
+ new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
118
+ new_pred_contents.append(new_sample)
119
+
120
+ # Generating list of id's and corresponding files
121
+ id_list = [x['video_name'] for x in new_pred_contents]
122
+ caption_files = [f"{id}.json" for id in id_list]
123
+
124
+ output_dir = args.output_dir
125
+ # Generate output directory if not exists.
126
+ if not os.path.exists(output_dir):
127
+ os.makedirs(output_dir)
128
+
129
+ # Preparing dictionary of question-answer sets
130
+ prediction_set = {}
131
+ for sample in new_pred_contents:
132
+ id = sample['video_name']
133
+ question1 = sample['prompt_1']
134
+ question2 = sample['prompt_2']
135
+ answer = sample['answer']
136
+ pred1 = sample['text_1']
137
+ pred2 = sample['text_2']
138
+ qa_set = {"q1": question1, "q2": question2, "a": answer, "pred1": pred1, "pred2": pred2}
139
+ prediction_set[id] = qa_set
140
+
141
+ # Set the OpenAI API key.
142
+ openai.api_key = args.api_key
143
+
144
+ num_tasks = args.num_tasks
145
+
146
+ # While loop to ensure that all captions are processed.
147
+ while True:
148
+ try:
149
+ # Files that have already been processed.
150
+ completed_files = os.listdir(output_dir)
151
+ print(f"completed_files: {len(completed_files)}")
152
+
153
+ # Files that have not been processed yet.
154
+ incomplete_files = [f for f in caption_files if f not in completed_files]
155
+ print(f"incomplete_files: {len(incomplete_files)}")
156
+
157
+ # Break the loop when there are no incomplete files
158
+ if len(incomplete_files) == 0:
159
+ break
160
+ if len(incomplete_files) <= num_tasks:
161
+ num_tasks = 1
162
+
163
+ # Split tasks into parts.
164
+ part_len = len(incomplete_files) // num_tasks
165
+ all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
166
+ task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
167
+
168
+ # Use a pool of workers to process the files in parallel.
169
+ with Pool() as pool:
170
+ pool.starmap(annotate, task_args)
171
+
172
+ except Exception as e:
173
+ print(f"Error: {e}")
174
+
175
+ # Combine all the processed files into one
176
+ combined_contents = {}
177
+ json_path = args.output_json
178
+
179
+ # Iterate through json files
180
+ for file_name in os.listdir(output_dir):
181
+ if file_name.endswith(".json"):
182
+ file_path = os.path.join(output_dir, file_name)
183
+ with open(file_path, "r") as json_file:
184
+ content = json.load(json_file)
185
+ combined_contents[file_name[:-5]] = content
186
+
187
+ # Write combined content to a json file
188
+ with open(json_path, "w") as json_file:
189
+ json.dump(combined_contents, json_file)
190
+ print("All evaluation completed!")
191
+
192
+ # Calculate average score
193
+ score_sum = 0
194
+ count = 0
195
+ for key, result in combined_contents.items():
196
+ count += 1
197
+ score_match = result[0]['score']
198
+ score = int(score_match)
199
+ score_sum += score
200
+ average_score = score_sum / count
201
+
202
+ print("Average score for consistency:", average_score * 20)
203
+
204
+
205
+ if __name__ == "__main__":
206
+ main()
207
+
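Note: this consistency script reads paired fields, so each --pred_path record presumably carries two phrasings of the same question and a prediction for each, plus the shared reference answer. A minimal sketch of one record (key names from the script; placeholder values):

    sample = {
        "prompt_1": "What color is the car?",
        "prompt_2": "Can you tell me the color of the car?",
        "answer": "The car is red.",
        "text_1": "It is a red car.",
        "text_2": "The car appears to be red.",
    }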
ChatUniVi/eval/evaluate/evaluate_benchmark_5_consistency_mi.py ADDED
@@ -0,0 +1,208 @@
 
 
1
+ import openai
2
+ import os
3
+ import argparse
4
+ import json
5
+ import jsonlines
6
+ import ast
7
+ from multiprocessing.pool import Pool
8
+
9
+
10
+ def read_jsonl(file):
11
+ results = []
12
+ with open(file, encoding='utf-8') as f:
13
+ for item in jsonlines.Reader(f):
14
+ results.append(item)
15
+ return results
16
+
17
+
18
+ def parse_args():
19
+ parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
20
+ parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
21
+ parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
22
+ parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
23
+ parser.add_argument("--api_key", required=True, help="OpenAI API key.")
24
+ parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
25
+ args = parser.parse_args()
26
+ return args
27
+
28
+
29
+ def annotate(prediction_set, caption_files, output_dir):
30
+ """
31
+ Evaluates question and answer pairs using GPT-3 and
32
+ returns a score for consistency.
33
+ """
34
+ for file in caption_files:
35
+ key = file[:-5] # Strip file extension
36
+ qa_set = prediction_set[key]
37
+ question1 = qa_set['q1']
38
+ question2 = qa_set['q2']
39
+ answer = qa_set['a']
40
+ pred1 = qa_set['pred1']
41
+ pred2 = qa_set['pred2']
42
+ try:
43
+ # Compute the consistency score
44
+ completion = openai.ChatCompletion.create(
45
+ model="gpt-3.5-turbo",
46
+ messages=[
47
+ {
48
+ "role": "system",
49
+ "content":
50
+ "You are an intelligent chatbot designed for evaluating the consistency of generative outputs for similar video-based question-answer pairs. "
51
+ "You will be given two very similar questions, a common answer common to both the questions and predicted answers for the two questions ."
52
+ "Your task is to compare the predicted answers for two very similar question, with a common correct answer and determine if they are consistent. Here's how you can accomplish the task:"
53
+ "------"
54
+ "##INSTRUCTIONS: "
55
+ "- Focus on the consistency between the two predicted answers and the correct answer. Both predicted answers should correspond to the correct answer and to each other, and should not contain any contradictions or significant differences in the conveyed information.\n"
56
+ "- Both predicted answers must be consistent with each other and the correct answer, in terms of the information they provide about the video content.\n"
57
+ "- Consider synonyms or paraphrases as valid matches, but only if they maintain the consistency in the conveyed information.\n"
58
+ "- Evaluate the consistency of the two predicted answers compared to the correct answer."
59
+ },
60
+ {
61
+ "role": "user",
62
+ "content":
63
+ "Please evaluate the following video-based question-answer pair:\n\n"
64
+ f"Question 1: {question1}\n"
65
+ f"Question 2: {question2}\n"
66
+ f"Correct Answer: {answer}\n"
67
+ f"Predicted Answer to Question 1: {pred1}\n"
68
+ f"Predicted Answer to Question 2: {pred2}\n\n"
69
+ "Provide your evaluation only as a consistency score where the consistency score is an integer value between 0 and 5, with 5 indicating the highest level of consistency. "
70
+ "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the consistency score in INTEGER, not STRING."
71
+ "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
72
+ "For example, your response should look like this: {''score': 4.8}."
73
+ }
74
+ ],
75
+ )
76
+ # Convert response to a Python dictionary.
77
+ response_message = completion["choices"][0]["message"]["content"]
78
+ response_dict = ast.literal_eval(response_message)
79
+ result_qa_pair = [response_dict, qa_set]
80
+
81
+ # Save the question-answer pairs to a json file.
82
+ with open(f"{output_dir}/{key}.json", "w") as f:
83
+ json.dump(result_qa_pair, f)
84
+
85
+ except Exception as e:
86
+ print(f"Error processing file '{key}': {e}")
87
+
88
+
89
+ def main():
90
+ """
91
+ Main function to control the flow of the program.
92
+ """
93
+ # Parse arguments.
94
+ args = parse_args()
95
+
96
+ file = args.pred_path
97
+ try:
98
+ pred_contents = json.load(open(file))
99
+ except Exception:
100
+ pred_contents = read_jsonl(file)
101
+
102
+ # Dictionary to store the count of occurrences for each video_id
103
+ video_id_counts = {}
104
+ new_pred_contents = []
105
+
106
+ # Iterate through each sample in pred_contents
107
+ for sample in pred_contents:
108
+ # video_id = sample['video_name']
109
+ video_id = 1
110
+ if video_id in video_id_counts:
111
+ video_id_counts[video_id] += 1
112
+ else:
113
+ video_id_counts[video_id] = 0
114
+
115
+ # Create a new sample with the modified key
116
+ new_sample = sample
117
+ new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
118
+ new_pred_contents.append(new_sample)
119
+
120
+ # Generating list of id's and corresponding files
121
+ id_list = [x['video_name'] for x in new_pred_contents]
122
+ caption_files = [f"{id}.json" for id in id_list]
123
+
124
+ output_dir = args.output_dir
125
+ # Generate output directory if not exists.
126
+ if not os.path.exists(output_dir):
127
+ os.makedirs(output_dir)
128
+
129
+ # Preparing dictionary of question-answer sets
130
+ prediction_set = {}
131
+ for sample in new_pred_contents:
132
+ id = sample['video_name']
133
+ question1 = sample['prompt_1']
134
+ question2 = sample['prompt_2']
135
+ answer = sample['answer']
136
+ pred1 = sample['text_1']
137
+ pred2 = sample['text_2']
138
+ qa_set = {"q1": question1, "q2": question2, "a": answer, "pred1": pred1, "pred2": pred2}
139
+ prediction_set[id] = qa_set
140
+
141
+ # Set the OpenAI API key.
142
+ openai.api_key = args.api_key
143
+ openai.api_base = 'http://10.234.32.86:8000/lumi-proxy'
144
+ openai.api_key = "Don't Panic"
145
+ num_tasks = args.num_tasks
146
+
147
+ # While loop to ensure that all captions are processed.
148
+ while True:
149
+ try:
150
+ # Files that have already been processed.
151
+ completed_files = os.listdir(output_dir)
152
+ print(f"completed_files: {len(completed_files)}")
153
+
154
+ # Files that have not been processed yet.
155
+ incomplete_files = [f for f in caption_files if f not in completed_files]
156
+ print(f"incomplete_files: {len(incomplete_files)}")
157
+
158
+ # Break the loop when there are no incomplete files
159
+ if len(incomplete_files) == 0:
160
+ break
161
+ if len(incomplete_files) <= num_tasks:
162
+ num_tasks = 1
163
+
164
+ # Split tasks into parts.
165
+ part_len = len(incomplete_files) // num_tasks
166
+ all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
167
+ task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
168
+
169
+ # Use a pool of workers to process the files in parallel.
170
+ with Pool() as pool:
171
+ pool.starmap(annotate, task_args)
172
+
173
+ except Exception as e:
174
+ print(f"Error: {e}")
175
+
176
+ # Combine all the processed files into one
177
+ combined_contents = {}
178
+ json_path = args.output_json
179
+
180
+ # Iterate through json files
181
+ for file_name in os.listdir(output_dir):
182
+ if file_name.endswith(".json"):
183
+ file_path = os.path.join(output_dir, file_name)
184
+ with open(file_path, "r") as json_file:
185
+ content = json.load(json_file)
186
+ combined_contents[file_name[:-5]] = content
187
+
188
+ # Write combined content to a json file
189
+ with open(json_path, "w") as json_file:
190
+ json.dump(combined_contents, json_file)
191
+ print("All evaluation completed!")
192
+
193
+ # Calculate average score
194
+ score_sum = 0
195
+ count = 0
196
+ for key, result in combined_contents.items():
197
+ count += 1
198
+ score_match = result[0]['score']
199
+ score = int(score_match)
200
+ score_sum += score
201
+ average_score = score_sum / count
202
+
203
+ print("Average score for consistency:", average_score * 20)
204
+
205
+
206
+ if __name__ == "__main__":
207
+ main()
208
+
ChatUniVi/eval/evaluate/evaluate_gpt_review_visual.py ADDED
@@ -0,0 +1,118 @@
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import requests
5
+ import openai
6
+ import time
7
+
8
+ NUM_SECONDS_TO_SLEEP = 0.5
9
+
10
+
11
+ def get_eval(content: str, max_tokens: int):
12
+ while True:
13
+ try:
14
+ response = openai.ChatCompletion.create(
15
+ model='gpt-4-0314', # gpt-4-0314
16
+ messages=[{
17
+ 'role': 'system',
18
+ 'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
19
+ }, {
20
+ 'role': 'user',
21
+ 'content': content,
22
+ }],
23
+ temperature=0.2, # TODO: figure out which temperature is best for evaluation
24
+ max_tokens=max_tokens,
25
+ )
26
+ break
27
+ except openai.error.RateLimitError:
28
+ pass
29
+ except Exception as e:
30
+ print(e)
31
+ time.sleep(NUM_SECONDS_TO_SLEEP)
32
+
33
+ return response['choices'][0]['message']['content']
34
+
35
+
36
+ def parse_score(review):
37
+ try:
38
+ score_pair = review.split('\n')[0]
39
+ score_pair = score_pair.replace(',', ' ')
40
+ sp = score_pair.split(' ')
41
+ if len(sp) == 2:
42
+ return [float(sp[0]), float(sp[1])]
43
+ else:
44
+ print('error', review)
45
+ return [-1, -1]
46
+ except Exception as e:
47
+ print(e)
48
+ print('error', review)
49
+ return [-1, -1]
50
+
51
+
52
+ if __name__ == '__main__':
53
+ parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
54
+ parser.add_argument('-q', '--question')
55
+ parser.add_argument('-c', '--context')
56
+ parser.add_argument('-a', '--answer-list', nargs='+', default=[])
57
+ parser.add_argument('-r', '--rule')
58
+ parser.add_argument('-o', '--output')
59
+ parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
60
+ args = parser.parse_args()
61
+
62
+ f_q = open(os.path.expanduser(args.question))
63
+ f_ans1 = open(os.path.expanduser(args.answer_list[0]))
64
+ f_ans2 = open(os.path.expanduser(args.answer_list[1]))
65
+ rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
66
+
67
+ if os.path.isfile(os.path.expanduser(args.output)):
68
+ cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
69
+ else:
70
+ cur_reviews = []
71
+
72
+ review_file = open(f'{args.output}', 'a')
73
+
74
+ context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
75
+ image_to_context = {context['image']: context for context in context_list}
76
+
77
+ handles = []
78
+ idx = 0
79
+ for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
80
+ ques = json.loads(ques_js)
81
+ ans1 = json.loads(ans1_js)
82
+ ans2 = json.loads(ans2_js)
83
+
84
+ inst = image_to_context[ques['image']]
85
+ cap_str = '\n'.join(inst['captions'])
86
+ box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']])
87
+
88
+ category = json.loads(ques_js)['category']
89
+ if category in rule_dict:
90
+ rule = rule_dict[category]
91
+ else:
92
+ assert False, f"Visual QA category not found in rule file: {category}."
93
+ prompt = rule['prompt']
94
+ role = rule['role']
95
+ content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n'
96
+ f'[Question]\n{ques["text"]}\n\n'
97
+ f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
98
+ f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
99
+ f'[System]\n{prompt}\n\n')
100
+ cur_js = {
101
+ 'id': idx+1,
102
+ 'question_id': ques['question_id'],
103
+ 'answer1_id': ans1.get('answer_id', ans1['question_id']),
104
+ 'answer2_id': ans2.get('answer_id', ans2['question_id']),
105
+ 'category': category
106
+ }
107
+ if idx >= len(cur_reviews):
108
+ review = get_eval(content, args.max_tokens)
109
+ scores = parse_score(review)
110
+ cur_js['content'] = review
111
+ cur_js['tuple'] = scores
112
+ review_file.write(json.dumps(cur_js) + '\n')
113
+ review_file.flush()
114
+ else:
115
+ print(f'Skipping {idx} as we already have it.')
116
+ idx += 1
117
+ print(idx)
118
+ review_file.close()
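Note: a minimal sketch of one line of the --context JSONL file this script appears to expect (field names taken from the accesses above; values are placeholders):

    context_entry = {
        "image": "000000001234.jpg",
        "captions": ["A dog is lying on a couch next to a remote."],
        "instances": [{"category": "dog", "bbox": [12.0, 34.0, 200.0, 150.0]}],
    }

Each --answer-list file is likewise read line by line as JSON objects that provide at least a 'text' field.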
ChatUniVi/eval/evaluate/evaluate_science_qa.py ADDED
@@ -0,0 +1,142 @@
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import re
5
+ import random
6
+ import numpy as np
7
+
8
+
9
+ def get_args():
10
+ parser = argparse.ArgumentParser()
11
+ parser.add_argument('--base-dir', type=str)
12
+ parser.add_argument('--result-file', type=str)
13
+ parser.add_argument('--output-file', type=str)
14
+ parser.add_argument('--output-result', type=str)
15
+ parser.add_argument('--split', type=str, default='test')
16
+ parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
17
+ return parser.parse_args()
18
+
19
+
20
+ def convert_caps(results):
21
+ fakecaps = []
22
+ for result in results:
23
+ image_id = result['question_id']
24
+ caption = result['text']
25
+ fakecaps.append({"image_id": int(image_id), "caption": caption})
26
+ return fakecaps
27
+
28
+
29
+ def get_pred_idx(prediction, choices, options):
30
+ """
31
+ Get the index (e.g. 2) from the prediction (e.g. 'C')
32
+ """
33
+ if prediction in options[:len(choices)]:
34
+ return options.index(prediction)
35
+ else:
36
+ return random.choice(range(len(choices)))
37
+
38
+
39
+ if __name__ == "__main__":
40
+ args = get_args()
41
+
42
+ base_dir = args.base_dir
43
+ split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
44
+ problems = json.load(open(os.path.join(base_dir, "problems.json")))
45
+ predictions = [json.loads(line) for line in open(args.result_file)]
46
+ predictions = {pred['question_id']: pred for pred in predictions}
47
+ split_problems = {idx: problems[idx] for idx in split_indices}
48
+
49
+ results = {'correct': [], 'incorrect': []}
50
+ sqa_results = {}
51
+ sqa_results['acc'] = None
52
+ sqa_results['correct'] = None
53
+ sqa_results['count'] = None
54
+ sqa_results['results'] = {}
55
+ sqa_results['outputs'] = {}
56
+
57
+ sqa_results['NAT'] = []
58
+ sqa_results['SOC'] = []
59
+ sqa_results['LAN'] = []
60
+ sqa_results['TXT'] = []
61
+ sqa_results['IMG'] = []
62
+ sqa_results['NO'] = []
63
+ sqa_results['G1-6'] = []
64
+ sqa_results['G7-12'] = []
65
+
66
+ for prob_id, prob in split_problems.items():
67
+ if prob_id not in predictions:
68
+ continue
69
+ pred = predictions[prob_id]
70
+ pred_text = pred['text']
71
+
72
+ pattern = re.compile(r'The answer is ([A-Z]).')
73
+ res = pattern.findall(pred_text)
74
+ if len(res) == 1:
75
+ answer = res[0] # 'A', 'B', ...
76
+ else:
77
+ answer = pred['pred']
78
+
79
+ pred_idx = get_pred_idx(answer, prob['choices'], args.options)
80
+
81
+ analysis = {
82
+ 'question_id': prob_id,
83
+ 'parsed_ans': answer,
84
+ 'ground_truth': args.options[prob['answer']],
85
+ 'question': pred['prompt'],
86
+ 'pred': pred_text,
87
+ 'is_multimodal': '<image>' in pred['prompt'],
88
+ }
89
+
90
+ sqa_results['results'][prob_id] = get_pred_idx(answer, prob['choices'], args.options)
91
+ sqa_results['outputs'][prob_id] = pred_text
92
+
93
+ if pred_idx == prob['answer']:
94
+ results['correct'].append(analysis)
95
+ cur_result = 1
96
+ else:
97
+ results['incorrect'].append(analysis)
98
+ cur_result = 0
99
+
100
+ if prob['subject'] == 'natural science':
101
+ sqa_results['NAT'].append(cur_result)
102
+ elif prob['subject'] == 'social science':
103
+ sqa_results['SOC'].append(cur_result)
104
+ elif prob['subject'] == 'language science':
105
+ sqa_results['LAN'].append(cur_result)
106
+
107
+ if prob['hint']:
108
+ sqa_results['TXT'].append(cur_result)
109
+ if prob['image']:
110
+ sqa_results['IMG'].append(cur_result)
111
+ if not prob['hint'] and not prob['image']:
112
+ sqa_results['NO'].append(cur_result)
113
+
114
+ if prob['grade'] in ['grade1', 'grade2', 'grade3', 'grade4', 'grade5', 'grade6']:
115
+ sqa_results['G1-6'].append(cur_result)
116
+ elif prob['grade'] in ['grade7', 'grade8', 'grade9', 'grade10', 'grade11', 'grade12']:
117
+ sqa_results['G7-12'].append(cur_result)
118
+
119
+
120
+ correct = len(results['correct'])
121
+ total = len(results['correct']) + len(results['incorrect'])
122
+ print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / total * 100:.2f}%')
123
+
124
+ print(f'Subject NAT: {len(sqa_results["NAT"])}, Correct: {sum(sqa_results["NAT"])}, Accuracy: {np.mean(sqa_results["NAT"]) * 100:.2f}%')
125
+ print(f'Subject SOC: {len(sqa_results["SOC"])}, Correct: {sum(sqa_results["SOC"])}, Accuracy: {np.mean(sqa_results["SOC"]) * 100:.2f}%')
126
+ print(f'Subject LAN: {len(sqa_results["LAN"])}, Correct: {sum(sqa_results["LAN"])}, Accuracy: {np.mean(sqa_results["LAN"]) * 100:.2f}%')
127
+
128
+ print(f'Context Modality TXT: {len(sqa_results["TXT"])}, Correct: {sum(sqa_results["TXT"])}, Accuracy: {np.mean(sqa_results["TXT"]) * 100:.2f}%')
129
+ print(f'Context Modality IMG: {len(sqa_results["IMG"])}, Correct: {sum(sqa_results["IMG"])}, Accuracy: {np.mean(sqa_results["IMG"]) * 100:.2f}%')
130
+ print(f'Context Modality NO: {len(sqa_results["NO"])}, Correct: {sum(sqa_results["NO"])}, Accuracy: {np.mean(sqa_results["NO"]) * 100:.2f}%')
131
+
132
+ print(f'Grade G1-6: {len(sqa_results["G1-6"])}, Correct: {sum(sqa_results["G1-6"])}, Accuracy: {np.mean(sqa_results["G1-6"]) * 100:.2f}%')
133
+ print(f'Grade G7-12: {len(sqa_results["G7-12"])}, Correct: {sum(sqa_results["G7-12"])}, Accuracy: {np.mean(sqa_results["G7-12"]) * 100:.2f}%')
134
+
135
+ sqa_results['acc'] = correct / total * 100
136
+ sqa_results['correct'] = correct
137
+ sqa_results['count'] = total
138
+
139
+ with open(args.output_file, 'w') as f:
140
+ json.dump(results, f, indent=2)
141
+ with open(args.output_result, 'w') as f:
142
+ json.dump(sqa_results, f, indent=2)
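A minimal, self-contained sketch of the answer-parsing path in evaluate_science_qa.py: the answer letter is pulled out of the model response with the same regular expression, mapped to a choice index, and compared against the ground-truth index. The sample strings are invented for illustration.
import re
import random

options = ["A", "B", "C", "D", "E"]

def get_pred_idx(prediction, choices, options):
    # Map a letter such as 'B' to its index; fall back to a random guess when parsing fails.
    if prediction in options[:len(choices)]:
        return options.index(prediction)
    return random.choice(range(len(choices)))

pred_text = "The answer is B. The diagram shows a closed circuit."
choices = ["open circuit", "closed circuit"]
ground_truth_idx = 1

res = re.findall(r'The answer is ([A-Z]).', pred_text)
answer = res[0] if len(res) == 1 else "FAILED"
pred_idx = get_pred_idx(answer, choices, options)
print(answer, pred_idx, pred_idx == ground_truth_idx)  # B 1 True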
ChatUniVi/eval/evaluate/evaluate_video_qa.py ADDED
@@ -0,0 +1,212 @@
1
+ import openai
2
+ import os
3
+ import argparse
4
+ import json
5
+ import jsonlines
6
+ import ast
7
+ from multiprocessing.pool import Pool
8
+
9
+
10
+ def read_jsonl(file):
11
+ results = []
12
+ with open(file, encoding='utf-8') as f:
13
+ for item in jsonlines.Reader(f):
14
+ results.append(item)
15
+ return results
16
+
17
+
18
+ def parse_args():
19
+ parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
20
+ parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
21
+ parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
22
+ parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
23
+ parser.add_argument("--api_key", required=True, help="OpenAI API key.")
24
+ parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
25
+ args = parser.parse_args()
26
+ return args
27
+
28
+
29
+ def annotate(prediction_set, caption_files, output_dir):
30
+ """
31
+ Evaluates question-answer pairs using GPT-3.5-Turbo.
32
+ Returns a score for correctness.
33
+ """
34
+ for file in caption_files:
35
+ key = file[:-5] # Strip file extension
36
+ qa_set = prediction_set[key]
37
+ question = qa_set['q']
38
+ answer = qa_set['a']
39
+ pred = qa_set['pred']
40
+ try:
41
+ # Compute the correctness score
42
+ completion = openai.ChatCompletion.create(
43
+ model="gpt-3.5-turbo",
44
+ messages=[
45
+ {
46
+ "role": "system",
47
+ "content":
48
+ "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. "
49
+ "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:"
50
+ "------"
51
+ "##INSTRUCTIONS: "
52
+ "- Focus on the meaningful match between the predicted answer and the correct answer.\n"
53
+ "- Consider synonyms or paraphrases as valid matches.\n"
54
+ "- Evaluate the correctness of the prediction compared to the answer."
55
+ },
56
+ {
57
+ "role": "user",
58
+ "content":
59
+ "Please evaluate the following video-based question-answer pair:\n\n"
60
+ f"Question: {question}\n"
61
+ f"Correct Answer: {answer}\n"
62
+ f"Predicted Answer: {pred}\n\n"
63
+ "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. "
64
+ "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING."
65
+ "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
66
+ "For example, your response should look like this: {'pred': 'yes', 'score': 4.8}."
67
+ }
68
+ ]
69
+ )
70
+ # Convert response to a Python dictionary.
71
+ response_message = completion["choices"][0]["message"]["content"]
72
+ response_dict = ast.literal_eval(response_message)
73
+ result_qa_pair = [response_dict, qa_set]
74
+
75
+ # Save the question-answer pairs to a json file.
76
+ with open(f"{output_dir}/{key}.json", "w") as f:
77
+ json.dump(result_qa_pair, f)
78
+
79
+ except Exception as e:
80
+ print(f"Error processing file '{key}': {e}")
81
+
82
+
83
+ def main():
84
+ """
85
+ Main function to control the flow of the program.
86
+ """
87
+ # Parse arguments.
88
+ args = parse_args()
89
+
90
+ file = args.pred_path
91
+ try:
92
+ pred_contents = json.load(open(file))
93
+ except Exception:
94
+ pred_contents = read_jsonl(file)
95
+
96
+ # Dictionary to store the count of occurrences for each video_id
97
+ video_id_counts = {}
98
+ new_pred_contents = []
99
+
100
+ # Iterate through each sample in pred_contents
101
+ for sample in pred_contents:
102
+ sample['video_name'] = 1
103
+ video_id = sample['video_name']
104
+ if video_id in video_id_counts:
105
+ video_id_counts[video_id] += 1
106
+ else:
107
+ video_id_counts[video_id] = 0
108
+
109
+ # Create a new sample with the modified key
110
+ new_sample = sample
111
+ new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
112
+ new_pred_contents.append(new_sample)
113
+
114
+ # Generating list of id's and corresponding files
115
+ id_list = [x['video_name'] for x in new_pred_contents]
116
+ caption_files = [f"{id}.json" for id in id_list]
117
+
118
+ output_dir = args.output_dir
119
+ # Generate output directory if not exists.
120
+ if not os.path.exists(output_dir):
121
+ os.makedirs(output_dir)
122
+
123
+ # Preparing dictionary of question-answer sets
124
+ prediction_set = {}
125
+ for sample in new_pred_contents:
126
+ id = sample['video_name']
127
+ question = sample['prompt']
128
+ answer = sample['answer']
129
+ pred = sample['text']
130
+ qa_set = {"q": question, "a": answer, "pred": pred}
131
+ prediction_set[id] = qa_set
132
+
133
+ # Set the OpenAI API key.
134
+ openai.api_key = args.api_key
135
+ num_tasks = args.num_tasks
136
+
137
+ # While loop to ensure that all captions are processed.
138
+ while True:
139
+ try:
140
+ # Files that have already been processed.
141
+ completed_files = os.listdir(output_dir)
142
+ print(f"completed_files: {len(completed_files)}")
143
+
144
+ # Files that have not been processed yet.
145
+ incomplete_files = [f for f in caption_files if f not in completed_files]
146
+ print(f"incomplete_files: {len(incomplete_files)}")
147
+
148
+ # Break the loop when there are no incomplete files
149
+ if len(incomplete_files) == 0:
150
+ break
151
+ if len(incomplete_files) <= num_tasks:
152
+ num_tasks = 1
153
+
154
+ # Split tasks into parts.
155
+ part_len = len(incomplete_files) // num_tasks
156
+ all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
157
+ task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
158
+
159
+ # Use a pool of workers to process the files in parallel.
160
+ with Pool() as pool:
161
+ pool.starmap(annotate, task_args)
162
+
163
+ except Exception as e:
164
+ print(f"Error: {e}")
165
+
166
+ # Combine all the processed files into one
167
+ combined_contents = {}
168
+ json_path = args.output_json
169
+
170
+ # Iterate through json files
171
+ for file_name in os.listdir(output_dir):
172
+ if file_name.endswith(".json"):
173
+ file_path = os.path.join(output_dir, file_name)
174
+ with open(file_path, "r") as json_file:
175
+ content = json.load(json_file)
176
+ combined_contents[file_name[:-5]] = content
177
+
178
+ # Write combined content to a json file
179
+ with open(json_path, "w") as json_file:
180
+ json.dump(combined_contents, json_file)
181
+ print("All evaluation completed!")
182
+
183
+ # Calculate average score and accuracy
184
+ score_sum = 0
185
+ count = 0
186
+ yes_count = 0
187
+ no_count = 0
188
+ for key, result in combined_contents.items():
189
+ # Computing score
190
+ count += 1
191
+ score_match = result[0]['score']
192
+ score = int(score_match)
193
+ score_sum += score
194
+
195
+ # Computing accuracy
196
+ pred = result[0]['pred']
197
+ if "yes" in pred.lower():
198
+ yes_count += 1
199
+ elif "no" in pred.lower():
200
+ no_count += 1
201
+
202
+ average_score = score_sum / count
203
+ accuracy = yes_count / (yes_count + no_count)
204
+ print("Yes count:", yes_count)
205
+ print("No count:", no_count)
206
+ print("Accuracy:", accuracy)
207
+ print("Average score:", average_score)
208
+
209
+
210
+ if __name__ == "__main__":
211
+ main()
212
+
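A small sketch of the aggregation step above: each per-video JSON stores the judge's dictionary reply next to the question-answer set, and accuracy and average score are derived from the 'pred' and 'score' fields. The replies below are fabricated.
import ast

raw_replies = ["{'pred': 'yes', 'score': 5}", "{'pred': 'no', 'score': 1}", "{'pred': 'yes', 'score': 4}"]
results = [ast.literal_eval(r) for r in raw_replies]  # same parsing as annotate()

yes_count = sum(1 for r in results if "yes" in r['pred'].lower())
no_count = sum(1 for r in results if "no" in r['pred'].lower())
score_sum = sum(int(r['score']) for r in results)

print("Accuracy:", yes_count / (yes_count + no_count))  # 0.666...
print("Average score:", score_sum / len(results))       # 3.333...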
ChatUniVi/eval/evaluate/evaluate_video_qa_mi.py ADDED
@@ -0,0 +1,214 @@
1
+ import openai
2
+ import os
3
+ import argparse
4
+ import json
5
+ import jsonlines
6
+ import ast
7
+ from multiprocessing.pool import Pool
8
+
9
+
10
+ def read_jsonl(file):
11
+ results = []
12
+ with open(file, encoding='utf-8') as f:
13
+ for item in jsonlines.Reader(f):
14
+ results.append(item)
15
+ return results
16
+
17
+
18
+ def parse_args():
19
+ parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
20
+ parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
21
+ parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
22
+ parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
23
+ parser.add_argument("--api_key", required=True, help="OpenAI API key.")
24
+ parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
25
+ args = parser.parse_args()
26
+ return args
27
+
28
+
29
+ def annotate(prediction_set, caption_files, output_dir):
30
+ """
31
+ Evaluates question-answer pairs using GPT-3.5-Turbo.
32
+ Returns a score for correctness.
33
+ """
34
+ for file in caption_files:
35
+ key = file[:-5] # Strip file extension
36
+ qa_set = prediction_set[key]
37
+ question = qa_set['q']
38
+ answer = qa_set['a']
39
+ pred = qa_set['pred']
40
+ try:
41
+ # Compute the correctness score
42
+ completion = openai.ChatCompletion.create(
43
+ model="gpt-3.5-turbo",
44
+ messages=[
45
+ {
46
+ "role": "system",
47
+ "content":
48
+ "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. "
49
+ "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:"
50
+ "------"
51
+ "##INSTRUCTIONS: "
52
+ "- Focus on the meaningful match between the predicted answer and the correct answer.\n"
53
+ "- Consider synonyms or paraphrases as valid matches.\n"
54
+ "- Evaluate the correctness of the prediction compared to the answer."
55
+ },
56
+ {
57
+ "role": "user",
58
+ "content":
59
+ "Please evaluate the following video-based question-answer pair:\n\n"
60
+ f"Question: {question}\n"
61
+ f"Correct Answer: {answer}\n"
62
+ f"Predicted Answer: {pred}\n\n"
63
+ "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. "
64
+ "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING."
65
+ "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
66
+ "For example, your response should look like this: {'pred': 'yes', 'score': 4.8}."
67
+ }
68
+ ]
69
+ )
70
+ # Convert response to a Python dictionary.
71
+ response_message = completion["choices"][0]["message"]["content"]
72
+ response_dict = ast.literal_eval(response_message)
73
+ result_qa_pair = [response_dict, qa_set]
74
+
75
+ # Save the question-answer pairs to a json file.
76
+ with open(f"{output_dir}/{key}.json", "w") as f:
77
+ json.dump(result_qa_pair, f)
78
+
79
+ except Exception as e:
80
+ print(f"Error processing file '{key}': {e}")
81
+
82
+
83
+ def main():
84
+ """
85
+ Main function to control the flow of the program.
86
+ """
87
+ # Parse arguments.
88
+ args = parse_args()
89
+
90
+ file = args.pred_path
91
+ try:
92
+ pred_contents = json.load(open(file))
93
+ except Exception:
94
+ pred_contents = read_jsonl(file)
95
+
96
+ # Dictionary to store the count of occurrences for each video_id
97
+ video_id_counts = {}
98
+ new_pred_contents = []
99
+
100
+ # Iterate through each sample in pred_contents
101
+ for sample in pred_contents:
102
+ sample['video_name'] = 1
103
+ video_id = sample['video_name']
104
+ if video_id in video_id_counts:
105
+ video_id_counts[video_id] += 1
106
+ else:
107
+ video_id_counts[video_id] = 0
108
+
109
+ # Create a new sample with the modified key
110
+ new_sample = sample
111
+ new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
112
+ new_pred_contents.append(new_sample)
113
+
114
+ # Generating list of id's and corresponding files
115
+ id_list = [x['video_name'] for x in new_pred_contents]
116
+ caption_files = [f"{id}.json" for id in id_list]
117
+
118
+ output_dir = args.output_dir
119
+ # Generate output directory if not exists.
120
+ if not os.path.exists(output_dir):
121
+ os.makedirs(output_dir)
122
+
123
+ # Preparing dictionary of question-answer sets
124
+ prediction_set = {}
125
+ for sample in new_pred_contents:
126
+ id = sample['video_name']
127
+ question = sample['prompt']
128
+ answer = sample['answer']
129
+ pred = sample['text']
130
+ qa_set = {"q": question, "a": answer, "pred": pred}
131
+ prediction_set[id] = qa_set
132
+
133
+ # Set the OpenAI API key.
134
+ openai.api_key = args.api_key
135
+ openai.api_base = 'http://10.234.32.86:8000/lumi-proxy'
136
+ openai.api_key = "Don't Panic"
137
+ num_tasks = args.num_tasks
138
+
139
+ # While loop to ensure that all captions are processed.
140
+ while True:
141
+ try:
142
+ # Files that have already been processed.
143
+ completed_files = os.listdir(output_dir)
144
+ print(f"completed_files: {len(completed_files)}")
145
+
146
+ # Files that have not been processed yet.
147
+ incomplete_files = [f for f in caption_files if f not in completed_files]
148
+ print(f"incomplete_files: {len(incomplete_files)}")
149
+
150
+ # Break the loop when there are no incomplete files
151
+ if len(incomplete_files) == 0:
152
+ break
153
+ if len(incomplete_files) <= num_tasks:
154
+ num_tasks = 1
155
+
156
+ # Split tasks into parts.
157
+ part_len = len(incomplete_files) // num_tasks
158
+ all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
159
+ task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
160
+
161
+ # Use a pool of workers to process the files in parallel.
162
+ with Pool() as pool:
163
+ pool.starmap(annotate, task_args)
164
+
165
+ except Exception as e:
166
+ print(f"Error: {e}")
167
+
168
+ # Combine all the processed files into one
169
+ combined_contents = {}
170
+ json_path = args.output_json
171
+
172
+ # Iterate through json files
173
+ for file_name in os.listdir(output_dir):
174
+ if file_name.endswith(".json"):
175
+ file_path = os.path.join(output_dir, file_name)
176
+ with open(file_path, "r") as json_file:
177
+ content = json.load(json_file)
178
+ combined_contents[file_name[:-5]] = content
179
+
180
+ # Write combined content to a json file
181
+ with open(json_path, "w") as json_file:
182
+ json.dump(combined_contents, json_file)
183
+ print("All evaluation completed!")
184
+
185
+ # Calculate average score and accuracy
186
+ score_sum = 0
187
+ count = 0
188
+ yes_count = 0
189
+ no_count = 0
190
+ for key, result in combined_contents.items():
191
+ # Computing score
192
+ count += 1
193
+ score_match = result[0]['score']
194
+ score = int(score_match)
195
+ score_sum += score
196
+
197
+ # Computing accuracy
198
+ pred = result[0]['pred']
199
+ if "yes" in pred.lower():
200
+ yes_count += 1
201
+ elif "no" in pred.lower():
202
+ no_count += 1
203
+
204
+ average_score = score_sum / count
205
+ accuracy = yes_count / (yes_count + no_count)
206
+ print("Yes count:", yes_count)
207
+ print("No count:", no_count)
208
+ print("Accuracy:", accuracy)
209
+ print("Average score:", average_score)
210
+
211
+
212
+ if __name__ == "__main__":
213
+ main()
214
+
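This file matches evaluate_video_qa.py except that it hard-codes an internal proxy endpoint and replaces the key passed via --api_key. Both evaluators shard the unfinished result files across a multiprocessing pool; a toy illustration of the splitting arithmetic, with placeholder file names:
incomplete_files = [f"video_{i}.json" for i in range(10)]
num_tasks = 3

# Same arithmetic as main(): floor-divide, then slice into contiguous parts.
part_len = len(incomplete_files) // num_tasks
all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
print([len(p) for p in all_parts])  # [3, 3, 3, 1] -- the pool may receive more parts than num_tasks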
ChatUniVi/eval/evaluate/summarize_gpt_review.py ADDED
@@ -0,0 +1,67 @@
1
+ import json
2
+ import os
3
+ from collections import defaultdict
4
+ import numpy as np
5
+ import argparse
6
+
7
+
8
+ def parse_args():
9
+ parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
10
+ parser.add_argument('-d', '--dir', default=None)
11
+ parser.add_argument('-f', '--files', nargs='*', default=None)
12
+ parser.add_argument('-i', '--ignore', nargs='*', default=None)
13
+ return parser.parse_args()
14
+
15
+
16
+ if __name__ == '__main__0':  # note: '__main__0' never matches, so this older variant is effectively disabled
17
+ args = parse_args()
18
+
19
+ if args.ignore is not None:
20
+ args.ignore = [int(x) for x in args.ignore]
21
+
22
+ if args.files is not None and len(args.files) > 0:
23
+ review_files = args.files
24
+ else:
25
+ review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith('review_'))]
26
+
27
+ for review_file in sorted(review_files):
28
+ config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '')
29
+ scores = defaultdict(list)
30
+ print(config)
31
+ with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f:
32
+ for review_str in f:
33
+ review = json.loads(review_str)
34
+ if args.ignore is not None and review['question_id'] in args.ignore:
35
+ continue
36
+ if 'category' in review:
37
+ scores[review['category']].append(review['tuple'])
38
+ scores['all'].append(review['tuple'])
39
+ else:
40
+ if 'tuple' in review:
41
+ scores['all'].append(review['tuple'])
42
+ else:
43
+ scores['all'].append(review['score'])
44
+ for k, v in sorted(scores.items()):
45
+ stats = np.asarray(v).mean(0).tolist()
46
+ stats = [round(x, 3) for x in stats]
47
+ # print(k, stats, round(stats[1]/stats[0]*100, 1))
48
+ print(k, round(stats[1]/stats[0]*100, 1))
49
+ print('=================================')
50
+
51
+
52
+ if __name__ == '__main__':
53
+ args = parse_args()
54
+
55
+ scores = defaultdict(list)
56
+ print(f'GPT-4 vs. {args.dir}')
57
+
58
+ with open(args.dir) as f:
59
+ for review_str in f:
60
+ review = json.loads(review_str)
61
+ scores[review['category']].append(review['tuple'])
62
+ scores['all'].append(review['tuple'])
63
+ for k, v in scores.items():
64
+ stats = np.asarray(v).mean(0).tolist()
65
+ stats = [round(x, 3) for x in stats]
66
+ print(k, stats, round(stats[1]/stats[0]*100, 1))
67
+ print('=================================')
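A sketch of the relative-score computation in summarize_gpt_review.py: every review line carries a 'tuple' score pair (reference answer first, evaluated model second, as the "GPT-4 vs." header suggests), and the summary prints the ratio of the column means. The numbers are invented.
import numpy as np

score_pairs = [[8.0, 7.0], [9.0, 8.5], [7.0, 6.0]]   # hypothetical (reference, candidate) scores

stats = [round(x, 3) for x in np.asarray(score_pairs).mean(0).tolist()]
relative = round(stats[1] / stats[0] * 100, 1)
print(stats, relative)  # [8.0, 7.167] 89.6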
ChatUniVi/eval/model_coco_vqa.py ADDED
@@ -0,0 +1,220 @@
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ from tqdm import tqdm
6
+ import shortuuid
7
+ from ChatUniVi.constants import *
8
+ from ChatUniVi.conversation import conv_templates, SeparatorStyle
9
+ from ChatUniVi.model.builder import load_pretrained_model
10
+ from ChatUniVi.utils import disable_torch_init
11
+ from ChatUniVi.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
12
+ from PIL import Image
13
+ import math
14
+ from abc import ABC
15
+ import numpy as np
16
+ import jsonlines
17
+
18
+
19
+ def get_acc(file):
20
+ acc, num = 0, 0
21
+ yes, no, fail = 0, 0, 0
22
+ tp, fp, fn, tn = 0, 0, 0, 0
23
+
24
+ with open(file, "r", encoding="utf8") as f:
25
+ for item in jsonlines.Reader(f):
26
+ num += 1
27
+ if "Yes" in item["text"] or "yes" in item["text"]:
28
+ yes += 1
29
+ if "Yes" in item["label"] or "yes" in item["label"]:
30
+ acc += 1
31
+ tp += 1
32
+ else:
33
+ fp += 1
34
+
35
+ elif "No" in item["text"] or "no" in item["text"]:
36
+ no += 1
37
+ if "No" in item["label"] or "no" in item["label"]:
38
+ acc += 1
39
+ tn += 1
40
+ else:
41
+ fn += 1
42
+ else:
43
+ fail += 1
44
+
45
+ result = {
46
+ "acc": acc / num,
47
+ "yes": yes / num,
48
+ "no": no / num,
49
+ "fail": fail / num,
50
+ "precision": tp / (tp + fp),
51
+ "recall": tp / (tp + fn),
52
+ }
53
+ result["F1-score"] = 2 * result["precision"] * result["recall"] / (result["precision"] + result["recall"])
54
+ print("\n========================================================================")
55
+ print(file)
56
+ print(result)
57
+ print("========================================================================\n")
58
+ return result
59
+
60
+
61
+ def split_list(lst, n):
62
+ """Split a list into n (roughly) equal-sized chunks"""
63
+ chunk_size = math.ceil(len(lst) / n)  # ceiling division
64
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
65
+
66
+
67
+ def get_chunk(lst, n, k):
68
+ chunks = split_list(lst, n)
69
+ return chunks[k]
70
+
71
+
72
+ class LogitsProcessor(ABC):
73
+ """Abstract base class for all logit processors that can be applied during generation."""
74
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
75
+ """Torch method for processing logits."""
76
+ raise NotImplementedError(
77
+ f"{self.__class__} is an abstract class. Only classes inheriting this class can be called."
78
+ )
79
+
80
+
81
+ def eval_model(args):
82
+ # Model
83
+ disable_torch_init()
84
+ model_path = os.path.expanduser(args.model_path)
85
+ model_name = "ChatUniVi"
86
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
87
+
88
+ mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
89
+ mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
90
+ if mm_use_im_patch_token:
91
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
92
+ if mm_use_im_start_end:
93
+ tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
94
+ model.resize_token_embeddings(len(tokenizer))
95
+
96
+ vision_tower = model.get_vision_tower()
97
+ if not vision_tower.is_loaded:
98
+ vision_tower.load_model()
99
+ image_processor = vision_tower.image_processor
100
+
101
+ questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
102
+ questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
103
+ answers_file = os.path.expanduser(args.answers_file)
104
+ os.makedirs(os.path.dirname(answers_file), exist_ok=True)
105
+ ans_file = open(answers_file, "w")
106
+ for line in tqdm(questions):
107
+ try:
108
+ idx = line["question_id"]
109
+ image_file = line["image"]
110
+ qs = line["text"]
111
+ label = line["label"]
112
+
113
+ cur_prompt = qs
114
+ if model.config.mm_use_im_start_end:
115
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
116
+ else:
117
+ qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
118
+
119
+ conv = conv_templates[args.conv_mode].copy()
120
+ conv.append_message(conv.roles[0], qs)
121
+ conv.append_message(conv.roles[1], None)
122
+ prompt = conv.get_prompt()
123
+
124
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
125
+
126
+ image = Image.open(os.path.join(args.image_folder, image_file))
127
+
128
+ image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
129
+
130
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
131
+ keywords = [stop_str]
132
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
133
+
134
+ if args.answer_prompter:
135
+ with torch.inference_mode():
136
+ output_ids = model.generate(
137
+ input_ids,
138
+ images=image_tensor.unsqueeze(0).half().cuda(),
139
+ do_sample=True,
140
+ temperature=args.temperature,
141
+ top_p=args.top_p,
142
+ num_beams=args.num_beams,
143
+ max_new_tokens=1024,
144
+ use_cache=True,
145
+ stopping_criteria=[stopping_criteria]
146
+ )
147
+
148
+ input_token_len = input_ids.shape[1]
149
+ n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
150
+ if n_diff_input_output > 0:
151
+ print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
152
+ outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
153
+ outputs = outputs.strip()
154
+ if outputs.endswith(stop_str):
155
+ outputs = outputs[:-len(stop_str)]
156
+ outputs = outputs.strip()
157
+
158
+ outputs_reasoning = outputs
159
+ input_ids = tokenizer_image_token(prompt + outputs_reasoning + ' The answer is ', tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
160
+
161
+ else:
162
+ outputs_reasoning = ""
163
+
164
+ with torch.inference_mode():
165
+ output_ids = model.generate(
166
+ input_ids,
167
+ images=image_tensor.unsqueeze(0).half().cuda(),
168
+ do_sample=True,
169
+ temperature=args.temperature,
170
+ top_p=args.top_p,
171
+ num_beams=args.num_beams,
172
+ output_scores=True,
173
+ return_dict_in_generate=True,
174
+ max_new_tokens=1024,
175
+ use_cache=True,
176
+ stopping_criteria=[stopping_criteria]
177
+ )
178
+
179
+ scores = output_ids.scores[0][0].to(torch.float32)
180
+ label_score = []
181
+ candidates = ["yes", "Yes", "no", "No"]
182
+ for can in candidates:
183
+ can_id = tokenizer.encode(can)[-1]
184
+ label_score.append(scores[can_id].item())
185
+ outputs = candidates[np.argmax(label_score)]
186
+
187
+ ans_id = shortuuid.uuid()
188
+ ans_file.write(json.dumps({"question_id": idx,
189
+ "prompt": cur_prompt,
190
+ "outputs_reasoning": outputs_reasoning + ' The answer is ' + outputs,
191
+ "text": outputs,
192
+ "label": label,
193
+ "answer_id": ans_id,
194
+ "model_id": model_name,
195
+ "metadata": {}}) + "\n")
196
+ ans_file.flush()
197
+ except Exception as e:
198
+ print(f"Error processing image file '{image_file}': {e}")
199
+ ans_file.close()
200
+ get_acc(answers_file)
201
+
202
+
203
+ if __name__ == "__main__":
204
+ parser = argparse.ArgumentParser()
205
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
206
+ parser.add_argument("--model-base", type=str, default=None)
207
+ parser.add_argument("--image-folder", type=str, default="")
208
+ parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
209
+ parser.add_argument("--answers-file", type=str, default="answer.jsonl")
210
+ parser.add_argument("--conv-mode", type=str, default="simpleqa")
211
+ parser.add_argument("--num-chunks", type=int, default=1)
212
+ parser.add_argument("--chunk-idx", type=int, default=0)
213
+ parser.add_argument("--temperature", type=float, default=0.2)
214
+ parser.add_argument("--top_p", type=float, default=None)
215
+ parser.add_argument("--num_beams", type=int, default=1)
216
+ parser.add_argument("--model_use", type=str, default="BASE")
217
+ parser.add_argument("--answer-prompter", action="store_true")
218
+ args = parser.parse_args()
219
+
220
+ eval_model(args)
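A compact sketch of the metric logic in get_acc: predicted and ground-truth answers are bucketed into yes/no, and precision, recall, and F1 come from the resulting confusion counts. The records are fabricated.
records = [
    {"text": "Yes, there is a dog.", "label": "yes"},
    {"text": "No.", "label": "no"},
    {"text": "Yes.", "label": "no"},
    {"text": "No, it is empty.", "label": "yes"},
    {"text": "Yes.", "label": "yes"},
]

tp = sum(1 for r in records if "yes" in r["text"].lower() and "yes" in r["label"].lower())
fp = sum(1 for r in records if "yes" in r["text"].lower() and "yes" not in r["label"].lower())
fn = sum(1 for r in records if "no" in r["text"].lower() and "no" not in r["label"].lower())
tn = sum(1 for r in records if "no" in r["text"].lower() and "no" in r["label"].lower())

precision, recall = tp / (tp + fp), tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print((tp + tn) / len(records), round(precision, 3), round(recall, 3), round(f1, 3))  # 0.6 0.667 0.667 0.667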
ChatUniVi/eval/model_video_consistency.py ADDED
@@ -0,0 +1,252 @@
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ from tqdm import tqdm
6
+ import shortuuid
7
+ from ChatUniVi.constants import *
8
+ from ChatUniVi.conversation import conv_templates, SeparatorStyle
9
+ from ChatUniVi.model.builder import load_pretrained_model
10
+ from ChatUniVi.utils import disable_torch_init
11
+ from ChatUniVi.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
12
+ from PIL import Image
13
+ import math
14
+ from decord import VideoReader, cpu
15
+ import numpy as np
16
+
17
+
18
+ def split_list(lst, n):
19
+ """Split a list into n (roughly) equal-sized chunks"""
20
+ chunk_size = math.ceil(len(lst) / n)  # ceiling division
21
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
22
+
23
+
24
+ def get_chunk(lst, n, k):
25
+ chunks = split_list(lst, n)
26
+ return chunks[k]
27
+
28
+
29
+ def _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=224, video_framerate=1, s=None, e=None):
30
+ # speed up video decode via decord.
31
+ video_mask = np.zeros(max_frames, dtype=np.int64)
32
+ max_video_length = 0
33
+
34
+ # T x 3 x H x W
35
+ video = np.zeros((max_frames, 3, image_resolution, image_resolution), dtype=np.float64)
36
+
37
+ if s is None:
38
+ start_time, end_time = None, None
39
+ else:
40
+ start_time = int(s)
41
+ end_time = int(e)
42
+ start_time = start_time if start_time >= 0. else 0.
43
+ end_time = end_time if end_time >= 0. else 0.
44
+ if start_time > end_time:
45
+ start_time, end_time = end_time, start_time
46
+ elif start_time == end_time:
47
+ end_time = start_time + 1
48
+
49
+ if os.path.exists(video_path):
50
+ vreader = VideoReader(video_path, ctx=cpu(0))
51
+ else:
52
+ print(video_path)
53
+ raise FileNotFoundError
54
+
55
+ fps = vreader.get_avg_fps()
56
+ f_start = 0 if start_time is None else int(start_time * fps)
57
+ f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
58
+ num_frames = f_end - f_start + 1
59
+ if num_frames > 0:
60
+ # T x 3 x H x W
61
+ sample_fps = int(video_framerate)
62
+ t_stride = int(round(float(fps) / sample_fps))
63
+
64
+ all_pos = list(range(f_start, f_end + 1, t_stride))
65
+ if len(all_pos) > max_frames:
66
+ sample_pos = [all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
67
+ else:
68
+ sample_pos = all_pos
69
+
70
+ patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
71
+
72
+ patch_images = torch.stack([image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in patch_images])
73
+ slice_len = patch_images.shape[0]
74
+
75
+ max_video_length = max_video_length if max_video_length > slice_len else slice_len
76
+ if slice_len < 1:
77
+ pass
78
+ else:
79
+ video[:slice_len, ...] = patch_images
80
+
81
+ return patch_images, video_mask
82
+ else:
83
+ print("video path: {} error.".format(video_path))
84
+
85
+ video_mask[:max_video_length] = [1] * max_video_length
86
+
87
+ return torch.from_numpy(video), video_mask
88
+
89
+
90
+ def eval_model(args):
91
+ # Model
92
+ disable_torch_init()
93
+ model_path = os.path.expanduser(args.model_path)
94
+ model_name = "ChatUniVi"
95
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
96
+
97
+ mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
98
+ mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
99
+ if mm_use_im_patch_token:
100
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
101
+ if mm_use_im_start_end:
102
+ tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
103
+ model.resize_token_embeddings(len(tokenizer))
104
+
105
+ vision_tower = model.get_vision_tower()
106
+ if not vision_tower.is_loaded:
107
+ vision_tower.load_model()
108
+ image_processor = vision_tower.image_processor
109
+
110
+ if model.config.config["use_cluster"]:
111
+ for n, m in model.named_modules():
112
+ m = m.to(dtype=torch.bfloat16)
113
+
114
+ # Load the ground truth file
115
+ with open(args.question_file) as file:
116
+ gt_contents = json.load(file)
117
+
118
+ answers_file = os.path.expanduser(args.answers_file)
119
+ os.makedirs(os.path.dirname(answers_file), exist_ok=True)
120
+ ans_file = open(answers_file, "w")
121
+
122
+ video_formats = ['.mp4', '.avi', '.mov', '.mkv']
123
+
124
+ # Iterate over each sample in the ground truth file
125
+ for sample in tqdm(gt_contents):
126
+ video_name = sample['video_name']
127
+ question_1 = sample['Q1']
128
+ question_2 = sample['Q2']
129
+
130
+ # Load the video file
+ video_path = None
131
+ for fmt in video_formats: # Added this line
132
+ temp_path = os.path.join(args.video_folder, f"{video_name}{fmt}")
133
+ if os.path.exists(temp_path):
134
+ video_path = temp_path
135
+ break
136
+
137
+ # Check if the video exists
138
+ if video_path is not None: # Modified this line
139
+ video_frames, _ = _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH)
140
+
141
+ qs = question_1
142
+ if model.config.mm_use_im_start_end:
143
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH + DEFAULT_IM_END_TOKEN + '\n' + qs
144
+ else:
145
+ qs = DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH + '\n' + qs
146
+
147
+ conv = conv_templates[args.conv_mode].copy()
148
+ conv.append_message(conv.roles[0], qs)
149
+ conv.append_message(conv.roles[1], None)
150
+ prompt = conv.get_prompt()
151
+
152
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(
153
+ 0).cuda()
154
+
155
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
156
+ keywords = [stop_str]
157
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
158
+
159
+ with torch.inference_mode():
160
+ output_ids = model.generate(
161
+ input_ids,
162
+ images=video_frames.half().cuda(),
163
+ do_sample=True,
164
+ temperature=args.temperature,
165
+ top_p=args.top_p,
166
+ num_beams=args.num_beams,
167
+ max_new_tokens=1024,
168
+ use_cache=True,
169
+ stopping_criteria=[stopping_criteria])
170
+
171
+ input_token_len = input_ids.shape[1]
172
+ n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
173
+ if n_diff_input_output > 0:
174
+ print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
175
+ outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
176
+ outputs = outputs.strip()
177
+ if outputs.endswith(stop_str):
178
+ outputs = outputs[:-len(stop_str)]
179
+ outputs_1 = outputs.strip()
180
+
181
+ qs = question_2
182
+ if model.config.mm_use_im_start_end:
183
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH + DEFAULT_IM_END_TOKEN + '\n' + qs
184
+ else:
185
+ qs = DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH + '\n' + qs
186
+
187
+ conv = conv_templates[args.conv_mode].copy()
188
+ conv.append_message(conv.roles[0], qs)
189
+ conv.append_message(conv.roles[1], None)
190
+ prompt = conv.get_prompt()
191
+
192
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(
193
+ 0).cuda()
194
+
195
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
196
+ keywords = [stop_str]
197
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
198
+
199
+ with torch.inference_mode():
200
+ output_ids = model.generate(
201
+ input_ids,
202
+ images=video_frames.half().cuda(),
203
+ do_sample=True,
204
+ temperature=args.temperature,
205
+ top_p=args.top_p,
206
+ num_beams=args.num_beams,
207
+ max_new_tokens=1024,
208
+ use_cache=True,
209
+ stopping_criteria=[stopping_criteria])
210
+
211
+ input_token_len = input_ids.shape[1]
212
+ n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
213
+ if n_diff_input_output > 0:
214
+ print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
215
+ outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
216
+ outputs = outputs.strip()
217
+ if outputs.endswith(stop_str):
218
+ outputs = outputs[:-len(stop_str)]
219
+ outputs_2 = outputs.strip()
220
+
221
+ ans_id = shortuuid.uuid()
222
+ ans_file.write(json.dumps({"video_name": sample['video_name'],
223
+ "prompt_1": question_1,
224
+ "text_1": outputs_1,
225
+ "prompt_2": question_2,
226
+ "text_2": outputs_2,
227
+ "answer_id": ans_id,
228
+ "model_id": model_name,
229
+ "answer": sample['A'],
230
+ "metadata": {}}) + "\n")
231
+ ans_file.flush()
232
+
233
+ ans_file.close()
234
+
235
+
236
+ if __name__ == "__main__":
237
+ parser = argparse.ArgumentParser()
238
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
239
+ parser.add_argument("--model-base", type=str, default=None)
240
+ parser.add_argument("--video-folder", type=str, default="")
241
+ parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
242
+ parser.add_argument("--answers-file", type=str, default="answer.jsonl")
243
+ parser.add_argument("--conv-mode", type=str, default="v1")
244
+ parser.add_argument("--num-chunks", type=int, default=1)
245
+ parser.add_argument("--chunk-idx", type=int, default=0)
246
+ parser.add_argument("--temperature", type=float, default=0.2)
247
+ parser.add_argument("--top_p", type=float, default=None)
248
+ parser.add_argument("--num_beams", type=int, default=1)
249
+ parser.add_argument("--model_use", type=str, default="BASE")
250
+ args = parser.parse_args()
251
+
252
+ eval_model(args)
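A minimal illustration of the frame-sampling rule in _get_rawvideo_dec: candidate positions are taken at roughly video_framerate frames per second, then uniformly sub-sampled with np.linspace whenever more than max_frames positions remain. The clip length and fps are made up, and 64 merely stands in for MAX_IMAGE_LENGTH.
import numpy as np

fps, duration_s = 30.0, 120           # hypothetical clip: 30 fps, two minutes
video_framerate, max_frames = 1, 64   # 64 is a stand-in for MAX_IMAGE_LENGTH

t_stride = int(round(fps / video_framerate))           # keep roughly one frame per second
all_pos = list(range(0, int(duration_s * fps), t_stride))
if len(all_pos) > max_frames:
    sample_pos = [all_pos[i] for i in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
else:
    sample_pos = all_pos

print(len(all_pos), len(sample_pos))  # 120 64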
ChatUniVi/eval/model_video_general.py ADDED
@@ -0,0 +1,213 @@
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ from tqdm import tqdm
6
+ import shortuuid
7
+ from ChatUniVi.constants import *
8
+ from ChatUniVi.conversation import conv_templates, SeparatorStyle
9
+ from ChatUniVi.model.builder import load_pretrained_model
10
+ from ChatUniVi.utils import disable_torch_init
11
+ from ChatUniVi.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
12
+ from PIL import Image
13
+ import math
14
+ from decord import VideoReader, cpu
15
+ import numpy as np
16
+
17
+
18
+ def split_list(lst, n):
19
+ """Split a list into n (roughly) equal-sized chunks"""
20
+ chunk_size = math.ceil(len(lst) / n)  # ceiling division
21
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
22
+
23
+
24
+ def get_chunk(lst, n, k):
25
+ chunks = split_list(lst, n)
26
+ return chunks[k]
27
+
28
+
29
+ def _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=224, video_framerate=1, s=None, e=None):
30
+ # speed up video decode via decord.
31
+ video_mask = np.zeros(max_frames, dtype=np.int64)
32
+ max_video_length = 0
33
+
34
+ # T x 3 x H x W
35
+ video = np.zeros((max_frames, 3, image_resolution, image_resolution), dtype=np.float64)
36
+
37
+ if s is None:
38
+ start_time, end_time = None, None
39
+ else:
40
+ start_time = int(s)
41
+ end_time = int(e)
42
+ start_time = start_time if start_time >= 0. else 0.
43
+ end_time = end_time if end_time >= 0. else 0.
44
+ if start_time > end_time:
45
+ start_time, end_time = end_time, start_time
46
+ elif start_time == end_time:
47
+ end_time = start_time + 1
48
+
49
+ if os.path.exists(video_path):
50
+ vreader = VideoReader(video_path, ctx=cpu(0))
51
+ else:
52
+ print(video_path)
53
+ raise FileNotFoundError
54
+
55
+ fps = vreader.get_avg_fps()
56
+ f_start = 0 if start_time is None else int(start_time * fps)
57
+ f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
58
+ num_frames = f_end - f_start + 1
59
+ if num_frames > 0:
60
+ # T x 3 x H x W
61
+ sample_fps = int(video_framerate)
62
+ t_stride = int(round(float(fps) / sample_fps))
63
+
64
+ all_pos = list(range(f_start, f_end + 1, t_stride))
65
+ if len(all_pos) > max_frames:
66
+ sample_pos = [all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
67
+ else:
68
+ sample_pos = all_pos
69
+
70
+ patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
71
+
72
+ patch_images = torch.stack([image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in patch_images])
73
+ slice_len = patch_images.shape[0]
74
+
75
+ max_video_length = max_video_length if max_video_length > slice_len else slice_len
76
+ if slice_len < 1:
77
+ pass
78
+ else:
79
+ video[:slice_len, ...] = patch_images
80
+
81
+ return patch_images, video_mask
82
+ else:
83
+ print("video path: {} error.".format(video_path))
84
+
85
+ video_mask[:max_video_length] = [1] * max_video_length
86
+
87
+ return torch.from_numpy(video), video_mask
88
+
89
+
90
+ def eval_model(args):
91
+ # Model
92
+ disable_torch_init()
93
+ model_path = os.path.expanduser(args.model_path)
94
+ model_name = "ChatUniVi"
95
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
96
+
97
+ mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
98
+ mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
99
+ if mm_use_im_patch_token:
100
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
101
+ if mm_use_im_start_end:
102
+ tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
103
+ model.resize_token_embeddings(len(tokenizer))
104
+
105
+ vision_tower = model.get_vision_tower()
106
+ if not vision_tower.is_loaded:
107
+ vision_tower.load_model()
108
+ image_processor = vision_tower.image_processor
109
+
110
+ if model.config.config["use_cluster"]:
111
+ for n, m in model.named_modules():
112
+ m = m.to(dtype=torch.bfloat16)
113
+
114
+ # Load the ground truth file
115
+ with open(args.question_file) as file:
116
+ gt_contents = json.load(file)
117
+
118
+ answers_file = os.path.expanduser(args.answers_file)
119
+ os.makedirs(os.path.dirname(answers_file), exist_ok=True)
120
+ ans_file = open(answers_file, "w")
121
+
122
+ video_formats = ['.mp4', '.avi', '.mov', '.mkv']
123
+
124
+ # Iterate over each sample in the ground truth file
125
+ for sample in tqdm(gt_contents):
126
+ video_name = sample['video_name']
127
+ sample_set = sample
128
+ qs = sample['Q']
129
+
130
+ # Load the video file
+ video_path = None
131
+ for fmt in video_formats: # Added this line
132
+ temp_path = os.path.join(args.video_folder, f"{video_name}{fmt}")
133
+ if os.path.exists(temp_path):
134
+ video_path = temp_path
135
+ break
136
+
137
+ # Check if the video exists
138
+ if video_path is not None: # Modified this line
139
+ video_frames, _ = _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH)
140
+
141
+ try:
142
+ cur_prompt = qs
143
+ if model.config.mm_use_im_start_end:
144
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH + DEFAULT_IM_END_TOKEN + '\n' + qs
145
+ else:
146
+ qs = DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH + '\n' + qs
147
+
148
+ conv = conv_templates[args.conv_mode].copy()
149
+ conv.append_message(conv.roles[0], qs)
150
+ conv.append_message(conv.roles[1], None)
151
+ prompt = conv.get_prompt()
152
+
153
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(
154
+ 0).cuda()
155
+
156
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
157
+ keywords = [stop_str]
158
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
159
+
160
+ with torch.inference_mode():
161
+ output_ids = model.generate(
162
+ input_ids,
163
+ images=video_frames.half().cuda(),
164
+ do_sample=True,
165
+ temperature=args.temperature,
166
+ top_p=args.top_p,
167
+ num_beams=args.num_beams,
168
+ max_new_tokens=1024,
169
+ use_cache=True,
170
+ stopping_criteria=[stopping_criteria])
171
+
172
+ input_token_len = input_ids.shape[1]
173
+ n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
174
+ if n_diff_input_output > 0:
175
+ print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
176
+ outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
177
+ outputs = outputs.strip()
178
+ if outputs.endswith(stop_str):
179
+ outputs = outputs[:-len(stop_str)]
180
+ outputs = outputs.strip()
181
+
182
+ ans_id = shortuuid.uuid()
183
+ ans_file.write(json.dumps({'video_name': sample['video_name'],
184
+ "prompt": cur_prompt,
185
+ "text": outputs,
186
+ "answer_id": ans_id,
187
+ "model_id": model_name,
188
+ "answer": sample['A'],
189
+ "metadata": {}}) + "\n")
190
+ ans_file.flush()
191
+ except Exception as e:
192
+ print(f"Error processing video file '{video_name}': {e}")
193
+
194
+ ans_file.close()
195
+
196
+
197
+ if __name__ == "__main__":
198
+ parser = argparse.ArgumentParser()
199
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
200
+ parser.add_argument("--model-base", type=str, default=None)
201
+ parser.add_argument("--video-folder", type=str, default="")
202
+ parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
203
+ parser.add_argument("--answers-file", type=str, default="answer.jsonl")
204
+ parser.add_argument("--conv-mode", type=str, default="v1")
205
+ parser.add_argument("--num-chunks", type=int, default=1)
206
+ parser.add_argument("--chunk-idx", type=int, default=0)
207
+ parser.add_argument("--temperature", type=float, default=0.2)
208
+ parser.add_argument("--top_p", type=float, default=None)
209
+ parser.add_argument("--num_beams", type=int, default=1)
210
+ parser.add_argument("--model_use", type=str, default="BASE")
211
+ args = parser.parse_args()
212
+
213
+ eval_model(args)
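A sketch of how the multimodal prompt is assembled before tokenization: one image placeholder per (maximum) video frame is prepended to the question, optionally wrapped in start/end markers. The token strings and length below are stand-ins for the values defined in ChatUniVi/constants.py.
DEFAULT_IMAGE_TOKEN = "<image>"        # stand-in value
DEFAULT_IM_START_TOKEN = "<im_start>"  # stand-in value
DEFAULT_IM_END_TOKEN = "<im_end>"      # stand-in value
MAX_IMAGE_LENGTH = 4                   # shortened for readability
mm_use_im_start_end = False

qs = "What is the person doing in the video?"
if mm_use_im_start_end:
    qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
    qs = DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH + '\n' + qs
print(qs)  # <image><image><image><image>\nWhat is the person doing in the video?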
ChatUniVi/eval/model_video_qa.py ADDED
@@ -0,0 +1,232 @@
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ from tqdm import tqdm
6
+ import shortuuid
7
+ from ChatUniVi.constants import *
8
+ from ChatUniVi.conversation import conv_templates, SeparatorStyle
9
+ from ChatUniVi.model.builder import load_pretrained_model
10
+ from ChatUniVi.utils import disable_torch_init
11
+ from ChatUniVi.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
12
+ from PIL import Image
13
+ import math
14
+ from decord import VideoReader, cpu
15
+ import numpy as np
16
+
17
+
18
+ def read_json(file):
19
+ with open(file, "r", encoding='utf-8') as f:
20
+ data = json.load(f)
21
+ return data
22
+
23
+ def split_list(lst, n):
24
+ """Split a list into n (roughly) equal-sized chunks"""
25
+ chunk_size = math.ceil(len(lst) / n)  # ceiling division
26
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
27
+
28
+
29
+ def get_chunk(lst, n, k):
30
+ chunks = split_list(lst, n)
31
+ return chunks[k]
32
+
33
+
34
+ def _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=224, video_framerate=1, s=None, e=None):
35
+ # speed up video decode via decord.
36
+ video_mask = np.zeros(max_frames, dtype=np.int64)
37
+ max_video_length = 0
38
+
39
+ # T x 3 x H x W
40
+ video = np.zeros((max_frames, 3, image_resolution, image_resolution), dtype=np.float64)
41
+
42
+ if s is None:
43
+ start_time, end_time = None, None
44
+ else:
45
+ start_time = int(s)
46
+ end_time = int(e)
47
+ start_time = start_time if start_time >= 0. else 0.
48
+ end_time = end_time if end_time >= 0. else 0.
49
+ if start_time > end_time:
50
+ start_time, end_time = end_time, start_time
51
+ elif start_time == end_time:
52
+ end_time = start_time + 1
53
+
54
+ if os.path.exists(video_path):
55
+ vreader = VideoReader(video_path, ctx=cpu(0))
56
+ else:
57
+ print(video_path)
58
+ raise FileNotFoundError
59
+
60
+ fps = vreader.get_avg_fps()
61
+ f_start = 0 if start_time is None else int(start_time * fps)
62
+ f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
63
+ num_frames = f_end - f_start + 1
64
+ if num_frames > 0:
65
+ # T x 3 x H x W
66
+ sample_fps = int(video_framerate)
67
+ t_stride = int(round(float(fps) / sample_fps))
68
+
69
+ all_pos = list(range(f_start, f_end + 1, t_stride))
70
+ if len(all_pos) > max_frames:
71
+ sample_pos = [all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
72
+ else:
73
+ sample_pos = all_pos
74
+
75
+ patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
76
+
77
+ patch_images = torch.stack([image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in patch_images])
78
+ slice_len = patch_images.shape[0]
79
+
80
+ max_video_length = max_video_length if max_video_length > slice_len else slice_len
81
+ if slice_len < 1:
82
+ pass
83
+ else:
84
+ video[:slice_len, ...] = patch_images
85
+
86
+ return patch_images, video_mask
87
+ else:
88
+ print("video path: {} error.".format(video_path))
89
+
90
+ video_mask[:max_video_length] = [1] * max_video_length
91
+
92
+ return torch.from_numpy(video), video_mask
93
+
94
+
95
+ def eval_model(args):
96
+ # Model
97
+ disable_torch_init()
98
+ model_path = os.path.expanduser(args.model_path)
99
+ model_name = "ChatUniVi"
100
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
101
+
102
+ mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
103
+ mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
104
+ if mm_use_im_patch_token:
105
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
106
+ if mm_use_im_start_end:
107
+ tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
108
+ model.resize_token_embeddings(len(tokenizer))
109
+
110
+ vision_tower = model.get_vision_tower()
111
+ if not vision_tower.is_loaded:
112
+ vision_tower.load_model()
113
+ image_processor = vision_tower.image_processor
114
+
115
+ if model.config.config["use_cluster"]:
116
+ for n, m in model.named_modules():
117
+ m = m.to(dtype=torch.bfloat16)
118
+
119
+ # Load the ground truth file
120
+ with open(args.question_file) as file:
121
+ gt_contents = json.load(file)
122
+ answers_list = read_json(args.answers_list)
123
+
124
+ answers_file = os.path.expanduser(args.answers_file)
125
+ os.makedirs(os.path.dirname(answers_file), exist_ok=True)
126
+ ans_file = open(answers_file, "w")
127
+
128
+ video_formats = ['.mp4', '.avi', '.mov', '.mkv']
129
+
130
+ # Iterate over each sample in the ground truth file
131
+ for sample in tqdm(gt_contents):
132
+ sample_set = sample
133
+ qs = sample['question']
134
+
135
+ # Load the video file
+ video_path = None
136
+ for fmt in video_formats: # Added this line
137
+ video_name = sample['video_name']
138
+ temp_path = os.path.join(args.video_folder, f"{video_name}{fmt}")
139
+ if os.path.exists(temp_path):
140
+ video_path = temp_path
141
+ break
142
+
143
+ video_name = "v_" + sample['video_name']
144
+ temp_path = os.path.join(args.video_folder, f"{video_name}{fmt}")
145
+ if os.path.exists(temp_path):
146
+ video_path = temp_path
147
+ break
148
+
149
+ # Check if the video exists
150
+ if video_path is not None: # Modified this line
151
+ if args.max_frames:
152
+ video_frames, _ = _get_rawvideo_dec(video_path, image_processor, max_frames=args.max_frames)
153
+ else:
154
+ video_frames, _ = _get_rawvideo_dec(video_path, image_processor, max_frames=MAX_IMAGE_LENGTH)
155
+
156
+ try:
157
+ cur_prompt = qs
158
+ if model.config.mm_use_im_start_end:
159
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH + DEFAULT_IM_END_TOKEN + '\n' + qs
160
+ else:
161
+ qs = DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH + '\n' + qs
162
+
163
+ conv = conv_templates[args.conv_mode].copy()
164
+ conv.append_message(conv.roles[0], qs)
165
+ conv.append_message(conv.roles[1], None)
166
+ prompt = conv.get_prompt()
167
+
168
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(
169
+ 0).cuda()
170
+
171
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
172
+ keywords = [stop_str]
173
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
174
+
175
+ with torch.inference_mode():
176
+ output_ids = model.generate(
177
+ input_ids,
178
+ images=video_frames.half().cuda(),
179
+ do_sample=True,
180
+ temperature=args.temperature,
181
+ top_p=args.top_p,
182
+ num_beams=args.num_beams,
183
+ output_scores=True,
184
+ return_dict_in_generate=True,
185
+ max_new_tokens=1024,
186
+ use_cache=True,
187
+ stopping_criteria=[stopping_criteria])
188
+
189
+ output_ids = output_ids.sequences
190
+ input_token_len = input_ids.shape[1]
191
+ n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
192
+ if n_diff_input_output > 0:
193
+ print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
194
+ outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
195
+ outputs = outputs.strip()
196
+ if outputs.endswith(stop_str):
197
+ outputs = outputs[:-len(stop_str)]
198
+ outputs = outputs.strip()
199
+
200
+ ans_id = shortuuid.uuid()
201
+ ans_file.write(json.dumps({"video_name": sample['video_name'],
202
+ "prompt": cur_prompt,
203
+ "text": outputs,
204
+ "answer_id": ans_id,
205
+ "model_id": model_name,
206
+ "answer": sample['answer'],
207
+ "metadata": {}}) + "\n")
208
+ ans_file.flush()
209
+ except Exception as e:
210
+ print(f"Error processing video file '{video_name}': {e}")
211
+
212
+ ans_file.close()
213
+
214
+
215
+ if __name__ == "__main__":
216
+ parser = argparse.ArgumentParser()
217
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
218
+ parser.add_argument("--model-base", type=str, default=None)
219
+ parser.add_argument("--video-folder", type=str, default="")
220
+ parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
221
+ parser.add_argument("--answers-list", type=str, default="tables/answers_list.json")
222
+ parser.add_argument("--answers-file", type=str, default="answer.jsonl")
223
+ parser.add_argument("--conv-mode", type=str, default="v1")
224
+ parser.add_argument("--num-chunks", type=int, default=1)
225
+ parser.add_argument("--chunk-idx", type=int, default=0)
226
+ parser.add_argument("--temperature", type=float, default=0.2)
227
+ parser.add_argument("--top_p", type=float, default=None)
228
+ parser.add_argument("--num_beams", type=int, default=1)
229
+ parser.add_argument("--max_frames", type=int, default=None)
230
+ args = parser.parse_args()
231
+
232
+ eval_model(args)
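Each line the script above appends to --answers-file is one self-contained JSON object. A minimal sketch of consuming that output downstream, assuming the parser default answer.jsonl (the print is only illustrative):

import json

with open("answer.jsonl") as f:
    for line in f:
        rec = json.loads(line)
        # Keys written above: video_name, prompt, text, answer_id, model_id, answer, metadata.
        print(rec["video_name"], "| prediction:", rec["text"][:80], "| ground truth:", rec["answer"])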
ChatUniVi/eval/model_vqa.py ADDED
@@ -0,0 +1,125 @@
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ from tqdm import tqdm
6
+ import shortuuid
7
+ from ChatUniVi.constants import *
8
+ from ChatUniVi.conversation import conv_templates, SeparatorStyle
9
+ from ChatUniVi.model.builder import load_pretrained_model
10
+ from ChatUniVi.utils import disable_torch_init
11
+ from ChatUniVi.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
12
+ from PIL import Image
13
+ import math
14
+
15
+
16
+ def split_list(lst, n):
17
+ """Split a list into n (roughly) equal-sized chunks"""
18
+ chunk_size = math.ceil(len(lst) / n) # ceiling division, so the last chunk may be smaller
19
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
20
+
21
+
22
+ def get_chunk(lst, n, k):
23
+ chunks = split_list(lst, n)
24
+ return chunks[k]
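+ # For example, with 10 questions and --num-chunks 3, split_list yields chunks of
+ # sizes 4, 4 and 2, and --chunk-idx 0, 1, 2 pick them in order, so parallel
+ # workers cover the question list without overlap.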
25
+
26
+
27
+ def eval_model(args):
28
+ # Model
29
+ disable_torch_init()
30
+ model_path = os.path.expanduser(args.model_path)
31
+ model_name = "ChatUniVi"
32
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
33
+
34
+ mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
35
+ mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
36
+ if mm_use_im_patch_token:
37
+ tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
38
+ if mm_use_im_start_end:
39
+ tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
40
+ model.resize_token_embeddings(len(tokenizer))
41
+
42
+ vision_tower = model.get_vision_tower()
43
+ if not vision_tower.is_loaded:
44
+ vision_tower.load_model()
45
+ image_processor = vision_tower.image_processor
46
+
47
+ questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
48
+ questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
49
+ answers_file = os.path.expanduser(args.answers_file)
50
+ os.makedirs(os.path.dirname(answers_file), exist_ok=True)
51
+ ans_file = open(answers_file, "w")
52
+ for line in tqdm(questions):
53
+ idx = line["question_id"]
54
+ image_file = line["image"]
55
+ qs = line["text"]
56
+ cur_prompt = qs
57
+ if model.config.mm_use_im_start_end:
58
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
59
+ else:
60
+ qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
61
+
62
+ conv = conv_templates[args.conv_mode].copy()
63
+ conv.append_message(conv.roles[0], qs)
64
+ conv.append_message(conv.roles[1], None)
65
+ prompt = conv.get_prompt()
66
+
67
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
68
+
69
+ image = Image.open(os.path.join(args.image_folder, image_file))
70
+ image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
71
+
72
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
73
+ keywords = [stop_str]
74
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
75
+
76
+ with torch.inference_mode():
77
+ output_ids = model.generate(
78
+ input_ids,
79
+ images=image_tensor.unsqueeze(0).half().cuda(),
80
+ do_sample=True,
81
+ temperature=args.temperature,
82
+ top_p=args.top_p,
83
+ num_beams=args.num_beams,
84
+ max_new_tokens=args.max_new_tokens,
85
+ use_cache=True,
86
+ stopping_criteria=[stopping_criteria])
87
+
88
+ input_token_len = input_ids.shape[1]
89
+ n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
90
+ if n_diff_input_output > 0:
91
+ print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
92
+ outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
93
+ outputs = outputs.strip()
94
+ if outputs.endswith(stop_str):
95
+ outputs = outputs[:-len(stop_str)]
96
+ outputs = outputs.strip()
97
+
98
+ ans_id = shortuuid.uuid()
99
+ ans_file.write(json.dumps({"question_id": idx,
100
+ "prompt": cur_prompt,
101
+ "text": outputs,
102
+ "answer_id": ans_id,
103
+ "model_id": model_name,
104
+ "metadata": {}}) + "\n")
105
+ ans_file.flush()
106
+ ans_file.close()
107
+
108
+ if __name__ == "__main__":
109
+ parser = argparse.ArgumentParser()
110
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
111
+ parser.add_argument("--model-base", type=str, default=None)
112
+ parser.add_argument("--image-folder", type=str, default="")
113
+ parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
114
+ parser.add_argument("--answers-file", type=str, default="answer.jsonl")
115
+ parser.add_argument("--conv-mode", type=str, default="simple")
116
+ parser.add_argument("--num-chunks", type=int, default=1)
117
+ parser.add_argument("--chunk-idx", type=int, default=0)
118
+ parser.add_argument("--temperature", type=float, default=0.2)
119
+ parser.add_argument("--top_p", type=float, default=None)
120
+ parser.add_argument("--num_beams", type=int, default=1)
121
+ parser.add_argument("--model_use", type=str, default="BASE")
122
+ parser.add_argument("--max_new_tokens", type=int, default=1024)
123
+ args = parser.parse_args()
124
+
125
+ eval_model(args)
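The loop above expects --question-file to be JSONL with one object per line containing at least question_id, image, and text. A minimal sketch of producing such a file (the image name and question text are placeholder values, not files from this repo):

import json

questions = [
    {"question_id": 0, "image": "example.jpg", "text": "What is shown in the image?"},
]
with open("tables/question.jsonl", "w") as f:
    for q in questions:
        f.write(json.dumps(q) + "\n")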
ChatUniVi/eval/model_vqa_scienceqa.py ADDED
@@ -0,0 +1,164 @@
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ from tqdm import tqdm
6
+ import shortuuid
7
+
8
+ from ChatUniVi.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
9
+ from ChatUniVi.conversation import conv_templates, SeparatorStyle
10
+ from ChatUniVi.model.builder import load_pretrained_model
11
+ from ChatUniVi.utils import disable_torch_init
12
+ from ChatUniVi.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
13
+
14
+ from PIL import Image
15
+ import math
16
+ import numpy as np
17
+
18
+
19
+ def split_list(lst, n):
20
+ """Split a list into n (roughly) equal-sized chunks"""
21
+ chunk_size = math.ceil(len(lst) / n) # integer division
22
+ return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
23
+
24
+
25
+ def get_chunk(lst, n, k):
26
+ chunks = split_list(lst, n)
27
+ return chunks[k]
28
+
29
+
30
+ def eval_model(args):
31
+ # Model
32
+ disable_torch_init()
33
+ model_path = os.path.expanduser(args.model_path)
34
+ model_name = "ChatUniVi"
35
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
36
+
37
+ vision_tower = model.get_vision_tower()
38
+ if not vision_tower.is_loaded:
39
+ vision_tower.load_model()
40
+ image_processor = vision_tower.image_processor
41
+
42
+ questions = json.load(open(os.path.expanduser(args.question_file), "r"))
43
+ questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
44
+ answers_file = os.path.expanduser(args.answers_file)
45
+ os.makedirs(os.path.dirname(answers_file), exist_ok=True)
46
+ ans_file = open(answers_file, "w")
47
+ for i, line in enumerate(tqdm(questions)):
48
+ idx = line["id"]
49
+ question = line['conversations'][0]
50
+ gt_ans = line["conversations"][1]
51
+ qs = question['value'].replace('<image>', '').strip()
52
+ cur_prompt = qs
53
+
54
+ if 'image' in line:
55
+ image_file = line["image"].replace("\\", "/")
56
+ image = Image.open(os.path.join(args.image_folder, image_file))
57
+ image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
58
+ images = image_tensor.unsqueeze(0).half().cuda()
59
+ if getattr(model.config, 'mm_use_im_start_end', False):
60
+ qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
61
+ else:
62
+ qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
63
+ cur_prompt = '<image>' + '\n' + cur_prompt
64
+ else:
65
+ images = None
66
+
67
+ conv = conv_templates[args.conv_mode].copy()
68
+ conv.append_message(conv.roles[0], qs)
69
+ conv.append_message(conv.roles[1], None)
70
+ prompt = conv.get_prompt()
71
+
72
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
73
+
74
+ stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
75
+ keywords = [stop_str]
76
+ stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
77
+
78
+ with torch.inference_mode():
79
+ output_ids = model.generate(
80
+ input_ids,
81
+ images=images,
82
+ do_sample=True,
83
+ temperature=0.2,
84
+ max_new_tokens=1024,
85
+ use_cache=True,
86
+ stopping_criteria=[stopping_criteria])
87
+
88
+ input_token_len = input_ids.shape[1]
89
+ n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
90
+ if n_diff_input_output > 0:
91
+ print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
92
+ outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
93
+ outputs = outputs.strip()
94
+ if outputs.endswith(stop_str):
95
+ outputs = outputs[:-len(stop_str)]
96
+ outputs = outputs.strip()
97
+
98
+ outputs_reasoning = outputs
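+ # Second pass: re-encode the prompt with the model's free-form reasoning and an
+ # ' ###\nANSWER:' cue appended, then generate again so the logits of the first new
+ # token can be scored against the option letters below.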
99
+ input_ids = tokenizer_image_token(prompt + outputs_reasoning + ' ###\nANSWER:', tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
100
+
101
+ with torch.inference_mode():
102
+ output_ids = model.generate(
103
+ input_ids,
104
+ images=images,
105
+ do_sample=True,
106
+ temperature=0.2,
107
+ max_new_tokens=64,
108
+ use_cache=True,
109
+ output_scores=True,
110
+ return_dict_in_generate=True,
111
+ stopping_criteria=[stopping_criteria])
112
+
113
+ scores = output_ids.scores[0][0].to(torch.float32)
114
+ label_score = []
115
+
116
+ candidates = []
117
+ answers_list = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
118
+ for i in answers_list:
119
+ if "(" + i + ")" in cur_prompt:
120
+ candidates.append(i)
121
+
122
+ for can in candidates:
123
+ can_id = tokenizer.encode(can)[-1]
124
+ label_score.append(scores[can_id].item())
125
+ outputs_answer = candidates[np.argmax(label_score)]
126
+
127
+ output_ids = output_ids.sequences
128
+
129
+ input_token_len = input_ids.shape[1]
130
+ n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
131
+ if n_diff_input_output > 0:
132
+ print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
133
+ outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
134
+ outputs = outputs.strip()
135
+ if outputs.endswith(stop_str):
136
+ outputs = outputs[:-len(stop_str)]
137
+ outputs = outputs.strip()
138
+ outputs = outputs_reasoning + '\n The answer is ' + outputs
139
+
140
+ ans_id = shortuuid.uuid()
141
+ ans_file.write(json.dumps({"question_id": idx,
142
+ "prompt": cur_prompt,
143
+ "text": outputs,
144
+ "answer_id": ans_id,
145
+ "model_id": model_name,
146
+ "pred": outputs_answer,
147
+ "metadata": {}}) + "\n")
148
+ ans_file.flush()
149
+ ans_file.close()
150
+
151
+
152
+ if __name__ == "__main__":
153
+ parser = argparse.ArgumentParser()
154
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
155
+ parser.add_argument("--model-base", type=str, default=None)
156
+ parser.add_argument("--image-folder", type=str, default="")
157
+ parser.add_argument("--question-file", type=str, default="tables/question.json")
158
+ parser.add_argument("--answers-file", type=str, default="answer.jsonl")
159
+ parser.add_argument("--conv-mode", type=str, default="simple")
160
+ parser.add_argument("--num-chunks", type=int, default=1)
161
+ parser.add_argument("--chunk-idx", type=int, default=0)
162
+ args = parser.parse_args()
163
+
164
+ eval_model(args)
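The selection step above only considers option letters that literally appear in the prompt as "(A)", "(B)", and so on, and ranks them by the first-step logit of their token. A condensed restatement of that logic, assuming scores is the logit vector for the first generated token and tokenizer is the loaded tokenizer:

import numpy as np

def pick_option(prompt, scores, tokenizer):
    letters = [chr(ord("A") + i) for i in range(26)]
    # Keep only the letters offered as options in the prompt, e.g. "(A)".
    candidates = [c for c in letters if f"({c})" in prompt]
    # Score each candidate by the logit of the last token of its encoding.
    label_score = [scores[tokenizer.encode(c)[-1]].item() for c in candidates]
    return candidates[int(np.argmax(label_score))]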
ChatUniVi/eval/questions/coco2014_val_qa_eval/qa90_gpt4_answer.jsonl ADDED
@@ -0,0 +1,90 @@
1
+ {"question_id": 0, "text": "The colors of the two suitcases in the image are black and brown with yellow details.", "category": "conv"}
2
+ {"question_id": 1, "text": "The image features two antique suitcases made of leather, stacked one on top of the other. The top suitcase is black, while the bottom one has a combination of brown and yellow colors. Both suitcases are prominently displayed side by side, taking up a large portion of the image. They appear to be vintage luggage, possibly available for purchase with price tags attached, showcasing their classic design and craftsmanship.", "category": "detail"}
3
+ {"question_id": 2, "text": "Several factors could make the two stacked suitcases valuable. In the image, they are described as antique, which suggests that they are potentially rare or have some historical significance. Their age and rarity could contribute to their value. Additionally, they are made of leather, which is a material known for its durability and, in some cases, luxury. The suitcases also differ in color, featuring black, brown, and yellow tones, which may attract buyers due to their aesthetic appeal or the desire to own a unique piece of luggage. Furthermore, the presence of price tags on the luggage indicates that they are being sold, drawing attention to their perceived value in the market. Collectors or enthusiasts of vintage items may find these suitcases valuable due to their material, design, age, and distinct appearance.", "category": "complex"}
4
+ {"question_id": 3, "text": "The main objects on the table in the image are plates, glasses, eating utensils, and possibly a bottle of something.", "category": "conv"}
5
+ {"question_id": 4, "text": "The image showcases a dining table filled with various dirty dishes, eating utensils, and a bottle. The table has plates and glasses scattered across it. There are at least two cups on the table, with one positioned on the left and another on the right side. \n\nTwo forks can be seen, one resting on a small white plate and the other located towards the left side of the table. A spoon is also present, placed near the left corner of the table. Additionally, there is a bottle located in the upper central area of the table.", "category": "detail"}
6
+ {"question_id": 5, "text": "A meal or gathering might have recently taken place around this table, as evidenced by the variety of dirty dishes, utensils, and drinkware present on it. The aftermath of this meal or event could involve the need to clean up and wash the dirty dishes, glasses, and eating utensils. Items on the table include plates, forks, spoons, cups, and a bottle, which are typically used during such occasions. Participants might have engaged in conversation, shared food and beverages, and enjoyed each other's company at this table. Once the gathering is complete, disposing of any leftovers and cleaning the table area would be necessary to prepare it for the next use.", "category": "complex"}
7
+ {"question_id": 6, "text": "The main focus of the image is a cat or kitten sitting on or next to a laptop computer.", "category": "conv"}
8
+ {"question_id": 7, "text": "The image features a cat sitting on a wooden desk, intently staring at a black laptop computer that is open in front of it. The desk is quite organized, with several books spread around, both on the left and right sides of the laptop. Some books are neatly piled and others are laid out individually. The scene could be interpreted as the cat appearing to use the laptop or showing interest in what's on the screen.", "category": "detail"}
9
+ {"question_id": 8, "text": "The cat may be interested in the laptop due to the light coming from the screen, the movement of the cursor, or the sounds produced by the laptop, such as keystrokes or notifications. Cats are known to be naturally curious creatures and are often attracted to objects that stimulate their senses. The visual display, warmth emanating from the device, and interactive nature of the laptop could be appealing to the cat, prompting it to sit on the desk and face the open laptop. Additionally, the cat may simply be seeking its owner's attention or wanting to be close to them while they use the laptop.", "category": "complex"}
10
+ {"question_id": 9, "text": "The dog in the image is black.", "category": "conv"}
11
+ {"question_id": 10, "text": "The image features a large black dog with a red collar laying down on a kitchen floor. The tiled floor provides a comfortable space for the dog to relax. The dog is positioned towards the middle of the scene. Next to the dog, there are two bowls placed on the floor, one around the top-left side and the other slightly further to the right. These could be the dog's food and water bowls. The scene depicts a cozy domestic environment for the dog.", "category": "detail"}
12
+ {"question_id": 11, "text": "The dog may have chosen to lay down on the kitchen floor in this location for several reasons. First, the tiled floor can provide a cool surface, which could be comforting compared to a warm or soft surface. This is particularly true if the dog has a thick coat or if the temperature in the room is high. Second, the dog is positioned next to its bowls and a toy, which suggests that the location is familiar and associated with food, water, and playtime. The dog could be resting near these resources for convenience and ease of access. Lastly, the location could be a central spot in the home, and dogs often prefer to stay near their owners or members of the household. By laying on the kitchen floor, the dog might be able to keep an eye on the activities happening around it and remain close to its family.", "category": "complex"}
13
+ {"question_id": 12, "text": "The cart with luggage bags is located in a lobby.", "category": "conv"}
14
+ {"question_id": 13, "text": "The scene is set in a lobby at a commercial business, where a wheeled luggage cart is filled with various suitcases and bags. There are a number of luggage bags, including two backpacks, one near the front of the cart and another near the back, as well as multiple suitcases placed on different sections of the cart. \n\nA handbag is also visible, placed near the front of the cart. Two people are present in the scene: one person standing near the left side and another person on the right side, possibly engaging in conversation or waiting to transport their luggage to their rooms.", "category": "detail"}
15
+ {"question_id": 14, "text": "In the image, a cart is filled with numerous pieces of luggage, including suitcases, bags, and backpacks, in the lobby of a commercial business. The staff responsible for handling the luggage might face several challenges such as:\n\n1. Efficiently organizing and placing the luggage on the cart to prevent any damages or items from falling off during transportation.\n2. Identifying and keeping track of each guest's belongings to avoid misplacement or mixing up the luggage.\n3. Maneuvering the heavily loaded cart without causing hindrances to other guests in the lobby while ensuring their safety.\n4. Carrying or transporting the luggage to the guests' rooms in a timely manner to maintain customer satisfaction and efficient use of staff resources.\n\nTo overcome these challenges, staff might need proper training in luggage handling, organization, and customer service skills. Maintaining clear luggage tags and taking note of each item could help them ensure accurate delivery to the corresponding guests' rooms.", "category": "complex"}
16
+ {"question_id": 15, "text": "The image features a large, placid lake.", "category": "conv"}
17
+ {"question_id": 16, "text": "The image depicts a serene and picturesque landscape with a large, placid lake surrounded by beautiful tree-covered mountains. Cliffs rise majestically on the edge of the lake, showcasing the grandeur of nature. \n\nIn the distance, near the waterfront, a train can be seen traveling on a track. The train adds a touch of human presence to the otherwise untouched landscape and further enhances the scenic view.", "category": "detail"}
18
+ {"question_id": 17, "text": "There could be several reasons for the train to be traveling on this scenic route surrounded by mountains, water, and trees. First, the train route might have been built to provide transportation services to local communities or businesses located in this remote or picturesque area. It may facilitate travel, trade, or tourism in the region. Second, the scenic route could be intentionally designed to attract tourists who seek leisure train journeys that showcase natural beauty and captivating landscapes. These types of train rides offer a unique and immersive experience for people who enjoy exploring breathtaking scenery and discovering new destinations while traveling comfortably. Ultimately, the train route could serve both practical and recreational purposes, fostering connectivity and promoting tourism for the area.", "category": "complex"}
19
+ {"question_id": 18, "text": "The main object in the image is a colorful striped umbrella standing on the beach.", "category": "conv"}
20
+ {"question_id": 19, "text": "The image features a colorful striped umbrella on the beach, with its vibrant appearance attracting attention. The umbrella is firmly positioned in the sand, providing shade and comfort for visitors. The umbrella sits near the ocean, offering a lovely view of the water and contributing to the pleasant beach atmosphere.", "category": "detail"}
21
+ {"question_id": 20, "text": "The colorful umbrella set up on the beach serves several purposes. First, it provides shade from the sun, which helps protect people from direct sunlight and the associated harmful UV rays, reducing the risk of sunburn and heatstroke. Second, it offers a comfortable spot to relax, rest, and enjoy the beach ambiance without being exposed to the intense sun or heat. Finally, the umbrella creates a designated area for beach-goers to keep their belongings and have a visible landmark to locate their spot quickly when in the crowded beach environment.", "category": "complex"}
22
+ {"question_id": 21, "text": "The house is in the beginning stages of its construction, with the wooden structure, studs, and staircase partially built but not yet completed.", "category": "conv"}
23
+ {"question_id": 22, "text": "The image shows the beginning stages of a house under construction. The wooden structure is still being built, and there are lots of wooden studs visible throughout the scene, indicating work in progress. Additionally, a partially-built staircase can be seen within the house, giving an idea of the floor plan layout.\n\nDespite the house still being under construction, a bed and a TV are already present in the area. The bed is located near the center of the image, with the TV placed to its left. This shows that some household items have been brought in, probably in anticipation of the house's completion.", "category": "detail"}
24
+ {"question_id": 23, "text": "In the current state of the house construction with wooden studs and a partially-built staircase, a potential obstacle that might occur is the incomplete structure being exposed to external elements like rain, snow, or strong winds. These conditions can potentially damage the building materials, delay construction work, and cause additional unexpected costs for repairing or replacing materials. It is essential for the builders to properly cover the construction site and store materials accordingly to protect them from potential damage when not in use.", "category": "complex"}
25
+ {"question_id": 24, "text": "There are three pizzas in the image.", "category": "conv"}
26
+ {"question_id": 25, "text": "The image displays three open pizza boxes with pizzas inside. The first pizza box, situated to the left, contains a smaller pizza. The second pizza box, positioned in the middle, features a larger pizza topped with tortilla chips. The third pizza box, located to the right, contains another smaller pizza. \n\nBeside the large pizza, there are two side orders of sauce in bowls, accompanying the delicious meal. The arrangement showcases a delicious spread of pizzas, inviting everyone to enjoy their different flavors and sizes.", "category": "detail"}
27
+ {"question_id": 26, "text": "When dividing these pizzas equitably among a group of people, one might face several challenges due to the different sizes and potential toppings on the pizzas. In the image, there are three pizzas sitting in their delivery boxes, with one pizza being larger than the other two. This size difference may complicate equal distribution of pizza slices among the group, as some people might want more of the larger pizza, while others may prefer the smaller ones.\n\nFurthermore, one of the pizzas is topped with tortilla chips, which could lead to preference differences among the group, as some people might like this topping while others may not. Lastly, alongside one of the pizzas, there are two side orders of sauce, which may need to be shared among those who enjoy sauce with their pizza. All these factors make it a challenge to ensure that everyone in the group has an equal and satisfying share of the pizzas and accompanying sauces. One possible solution is to have a group discussion and decide on rules or a plan for dividing the pizzas, taking into account individual preferences and dietary restrictions if any.", "category": "complex"}
28
+ {"question_id": 27, "text": "There are four doughnuts in the box.", "category": "conv"}
29
+ {"question_id": 28, "text": "The image features a box filled with four different kinds of doughnuts sitting on a table. Each doughnut appears to have a unique design or topping. One of the doughnuts is a cake-style doughnut, while another has nuts and coconut on top. The third doughnut appears to be a glazed one, and the last doughnut also has various toppings. The doughnuts are positioned inside the box in close proximity to one another, showcasing their delicious variety.", "category": "detail"}
30
+ {"question_id": 29, "text": "The box contains a variety of doughnuts with unique taste combinations. Among the four doughnuts, there are cake doughnuts, a glazed doughnut, and one with a mix of nuts and coconut as toppings. These different toppings and textures provide a diverse selection for those who want to try various flavors and types of doughnuts. Combining the flavors of nuts and coconut on one doughnut, in particular, offers a unique taste experience that blends the richness of nuts with the tropical sweetness of coconut. Thus, the box offers a selection that caters to different preferences and invites people to discover new taste combinations.", "category": "complex"}
31
+ {"question_id": 30, "text": "In the image, the man is holding up clothes on hangers, possibly in some hanger bags, and appears to be preparing for travel, as a suitcase is also in front of him.", "category": "conv"}
32
+ {"question_id": 31, "text": "The image depicts a man standing near a window, holding two clothing hangers with garments in both hands, one red and one white. In front of him, there is a suitcase, possibly indicating that he is preparing for travel or organizing his belongings. The man appears to be showing or comparing these clothes while packing his luggage.", "category": "detail"}
33
+ {"question_id": 32, "text": "The man might be holding the clothes on hangers because he is preparing to pack his suitcase for a trip or has just arrived at his destination and is unpacking his belongings. By holding the clothes on hangers, he can avoid wrinkles and ensure that the garments remain in good condition for wearing during his journey or at his destination. The presence of a suitcase in front of him supports this idea, as it is commonly used for transportation and storage of personal belongings when traveling.", "category": "complex"}
34
+ {"question_id": 33, "text": "There are four giraffes in the image.", "category": "conv"}
35
+ {"question_id": 34, "text": "The image showcases a group of four giraffes standing in their natural habitat, a grassy field with some dry brush. They are dispersed across the field, with one giraffe positioned more towards the left, and the other three clustered closer together on the right side. The giraffes appear to be grazing peacefully or engaging in social interactions. A tree can be seen in the scene, providing shade and a point of interest for the giraffes to gather around.", "category": "detail"}
36
+ {"question_id": 35, "text": "In the image, there are four giraffes gathered near the same tree in their natural habitat, a grass field with dry brush. Giraffes may gather near the same tree for various reasons. One common reason is to feed, as their long necks enable them to reach high branches for fresh leaves and foliage, which is a primary part of their diet. Giraffes are also social animals, so they may gather around the same tree to interact with each other and maintain their social bonds. Furthermore, the tree can provide shade in the hot African sun, offering a place for the giraffes to rest and regulate their body temperature.", "category": "complex"}
37
+ {"question_id": 36, "text": "The main focus of the image is the head of a giraffe, shown in a close-up headshot.", "category": "conv"}
38
+ {"question_id": 37, "text": "The image is a clear and colorful close up of a giraffe's head, showcasing its beautiful features. The giraffe is standing tall among several bright green trees, likely in its natural habitat. It appears to be chewing on some leaves, while its face is turned to the side, providing a detailed view of its profile.", "category": "detail"}
39
+ {"question_id": 38, "text": "From the image, it can be inferred that the giraffe's habitat is likely a natural environment with an abundance of greenery. The giraffe is standing tall among several bright green trees, which suggests that the area is rich in vegetation and provides plenty of food sources for the giraffe, such as leaves. This natural habitat is likely suitable for giraffes to thrive and maintain their unique characteristics, such as their long necks, which are adapted for reaching leaves on tall trees.", "category": "complex"}
40
+ {"question_id": 39, "text": "The art installation includes a display of large artworks, such as an oversized bench, painted feline heads, and potentially other artistic sculptures or images.", "category": "conv"}
41
+ {"question_id": 40, "text": "The image features an art installation placed on a city street. The display consists of a large, oversized wooden bench and two massive animal art sculptures, which appear to be painted feline heads. \n\nThere are several people in the scene, with two people sitting near the art display, one person standing nearby, and another person further to the left. A parked car is visible behind the bench, close to the edge of the street. Additionally, a backpack can be seen on the ground, closer to the left side of the bench.", "category": "detail"}
42
+ {"question_id": 41, "text": "The art installation, featuring an oversized wooden bench and two massive animal art sculptures, encourages interaction among people by providing a visually intriguing and inviting space to sit or gather. People are naturally drawn to the unique and whimsical nature of these installations, sparking curiosity and conversation among those who encounter them. The oversized bench provides seating for multiple individuals, prompting people to sit down, rest, or socialize while admiring the giant painted feline heads and other artworks on display. Additionally, the installations capture the attention of passersby, creating a shared experience and a sense of community among those who visit the area. The artistic environment stimulates discussion and appreciation of the art, facilitating social interactions and connections among those who gather near the sculptures and oversized bench.", "category": "complex"}
43
+ {"question_id": 42, "text": "The main focus of the image is a car parked on the side of the road, next to a parking meter.", "category": "conv"}
44
+ {"question_id": 43, "text": "The scene features a black car parked on the side of the road next to a parking meter. There is another parking meter slightly further ahead of the car. The car is parked in front of a building, which seems to be the destination for the driver. \n\nAdditionally, there are other vehicles in the image. A gray car is situated further behind the black car, and a truck can be seen in the background on the right side. Moreover, there is another vehicle barely visible on the left side of the scene.", "category": "detail"}
45
+ {"question_id": 44, "text": "The most plausible reason for the car to be parked on the side of the road is that it is utilizing the available parking spot with parking meters. In the image, the car is parked next to two parking meters, which indicates that the driver has parked the car to follow parking regulations and likely paid for their parking session. The car might be parked there so the driver or passengers can go to a nearby building, shop, or participate in any other activities in the area. Parking meters are commonly used in urban and commercial areas to manage and regulate limited street parking spaces, and it's reasonable to assume the driver is abiding by these parking rules.", "category": "complex"}
46
+ {"question_id": 45, "text": "The unusual aspect of the elderly man's facial appearance in the image is that he has the head of a toothbrush under his nose, which mimics a mustache, specifically a toothbrush mustache.", "category": "conv"}
47
+ {"question_id": 46, "text": "The image features an elderly man humorously wearing the head of a toothbrush under his nose, creating the illusion of a toothbrush mustache. Since the toothbrush head is positioned like a mustache, it is reminiscent of a toothbrush mustache caricature often associated with Hitler. The man takes up most of the frame, appearing front and center in the scene.", "category": "detail"}
48
+ {"question_id": 47, "text": "The elderly man wearing the head of a toothbrush under his nose as a moustache could be conveying a satirical or humorous message. By mimicking a toothbrush mustache, he may be making light of or criticizing a specific historical figure or style associated with the toothbrush mustache, such as Adolf Hitler. Using a toothbrush instead of an actual mustache allows him to dramatically emphasize the point and create a lighthearted or comical effect. However, it's essential to consider cultural and social contexts when interpreting such unconventional expressions, as the message may not always be clear or universally accepted.", "category": "complex"}
49
+ {"question_id": 48, "text": "There are two large passenger airplanes visible in the image.", "category": "conv"}
50
+ {"question_id": 49, "text": "The scene captures a picturesque view of two large passenger jets, one in the air and one on the ground. They are flying over a beautiful beach filled with various birds. Many birds can be seen scattered across the scene, both near the water and around the area. The blue water in the background adds a calming touch to the image.", "category": "detail"}
51
+ {"question_id": 50, "text": "The presence of two large passenger jets flying in close proximity to the beach filled with birds can have a negative impact on the avian population. The noise generated by the engines can be stressful and disruptive for the birds, potentially resulting in altered behaviors, flight patterns, or even relocation from the area. In addition, the airplanes could pose a risk of bird strikes, in which birds collide with the planes, leading to injuries or fatalities for the birds and damage to the aircraft. This highlights the importance of maintaining safe distances between airplanes and wildlife habitats in order to minimize the risk of adverse effects on both the environment and aviation.", "category": "complex"}
52
+ {"question_id": 51, "text": "The cat is curled up and sleeping on the red couch.", "category": "conv"}
53
+ {"question_id": 52, "text": "A tan and black cat is curled up and comfortably asleep on a vibrant red couch. The cat is situated towards the center of the plush seat. The red couch itself is prominent in the image, occupying a large portion of the frame. The peaceful sleeping cat adds a sense of warmth and coziness to the scene.", "category": "detail"}
54
+ {"question_id": 53, "text": "The cat may have chosen to sleep on the red couch for several reasons. The couch appears to be plush and comfortable, providing a soft surface for the cat to curl up and maintain its body heat. Cats also prefer quiet and low-traffic areas to rest. The couch may be located in such an area, providing a sense of security for the cat while sleeping. Furthermore, felines are known to be attracted to soft and warm materials, making the red velvet seat an appealing spot for relaxation. It's also common for cats to sleep in locations familiar to them or near their human companions, so the couch might be a place where the cat often spends time or feels safe.", "category": "complex"}
55
+ {"question_id": 54, "text": "An old Volkswagen Bug is parked next to the stop sign in the image.", "category": "conv"}
56
+ {"question_id": 55, "text": "In the scene, a woman is standing in front of a window, observing the street outside. On the street, a vintage Volkswagen Bug is parked near a stop sign, which is located towards the center of the image. Several other cars occupy the street, including one positioned to the left of the Volkswagen and two others in the background. On the far left and far right sides of the image, there are additional cars partially visible. The street also features a tree growing near the stop sign, adding a touch of greenery to the urban setting.", "category": "detail"}
57
+ {"question_id": 56, "text": "From the image, it can be inferred that the traffic situation in the area is relatively light or calm. There is an old VW Bug parked near a stop sign, and a few other cars are also visible on the street. However, there doesn't seem to be any congestion or significant traffic. The presence of the stop sign indicates that there is some traffic regulation and order in place to manage vehicle movement, contributing to the overall calmness of the area. Moreover, no people are seen near the stop sign, which further suggests that there might be low pedestrian activity at the time the picture was taken.", "category": "complex"}
58
+ {"question_id": 57, "text": "The man in the image is using his cell phone and giving a thumbs up, possibly posing for a picture.", "category": "conv"}
59
+ {"question_id": 58, "text": "The image depicts a group of people gathered inside a room, creating a lively social atmosphere. A man is prominently featured in the scene, holding a cell phone to his ear and smiling with a thumbs-up gesture, possibly posing for a picture or simply enjoying the conversation.\n\nThere are multiple other people in the room, adding to the gathering and engaging in various conversations. The scene also has two cups visible, one on the left side of the room and another on the right side, indicating that the gathering may be a casual social event.", "category": "detail"}
60
+ {"question_id": 59, "text": "It is likely that the man is giving a thumbs-up gesture while talking on the phone to express a positive emotion or to signal agreement, approval, or satisfaction with the conversation he is having. This non-verbal communication can be conveyed to the people around him or to the person taking the picture of him while on the phone call. The man might be receiving good news, agreeing with the person on the phone, or sharing his excitement with the group of people surrounding him in the room. The specific reason cannot be precisely determined from the image alone, but it does indicate a positive sentiment associated with the phone call.", "category": "complex"}
61
+ {"question_id": 60, "text": "The dog in the image is a husky.", "category": "conv"}
62
+ {"question_id": 61, "text": "The image shows a large, adorable husky dog sleeping peacefully on a dog bed in a room. The room has a somewhat dark ambiance, making the scene feel cozy and comfortable. \n\nIn the same room, there are two chairs, one positioned towards the center and another one closer to the right side. Additionally, there are two potted plants, one situated slightly behind and to the right of the dog and the other placed further to the right in the room. The presence of these elements gives the room a warm, inviting atmosphere.", "category": "detail"}
63
+ {"question_id": 62, "text": "The dog owner seems to have taken several measures to make their dog feel comfortable in the room. Based on the image, an adorable husky dog has a designated dog bed to sleep in, providing cushioning and a cozy space for rest. The room is also dimly lit or dark, which contributes to a calmer atmosphere that promotes sleep and relaxation. There might be other considerations, such as room temperature or air circulation, that aid in creating a comfortable environment for the dog. It's essential for pet owners to ensure that their pets have a secure and comfortable space to rest in order to maintain their well-being and happiness.", "category": "complex"}
64
+ {"question_id": 63, "text": "The color of the elephant in the image is grey.", "category": "conv"}
65
+ {"question_id": 64, "text": "The image depicts a baby elephant standing alone inside an enclosure. The elephant is positioned towards the center of the pen with a large sand dune, or dirt mount, nearby. In the background, there is a brick building that serves as part of the exhibit. The grey elephant stands calmly, occupying the majority of the scene in this pen.", "category": "detail"}
66
+ {"question_id": 65, "text": "A potential concern for the baby elephant standing alone inside the pen could be the lack of social interaction with other elephants. Elephants are highly social creatures and form strong bonds within their family units. The absence of other elephants in the enclosure might lead to feelings of loneliness, stress, and anxiety for the baby elephant. It is crucial for the mental and emotional well-being of elephants, especially young ones, to have regular opportunities to interact with their peers and family members in a naturalistic, supportive environment. By addressing this concern, the caretakers of the baby elephant can help ensure its psychological well-being and overall health.", "category": "complex"}
67
+ {"question_id": 66, "text": "On the orange plate, there are a burger or sandwich, a bowl of broccoli, and a pickle.", "category": "conv"}
68
+ {"question_id": 67, "text": "The image features a dining table with an orange plate in the center. On the plate, there is a half of a sandwich, a pickle, and a bowl of broccoli, creating a visually appealing and mouth-watering combination. A fork is also placed alongside the plate, ready to be used for enjoying the meal. Several individual pieces of broccoli can be seen around the bowl, adding variety and color to the scene.", "category": "detail"}
69
+ {"question_id": 68, "text": "Based on the image, it can be inferred that the individual has a balanced approach to their dietary choices. The meal on the table consists of a sandwich, a bowl of broccoli, and a pickle on an orange plate. The sandwich likely provides carbohydrates and proteins, while the broccoli offers essential vitamins, minerals, and fiber. The pickle adds some flavor and tanginess to the meal. This combination of foods illustrates that the person is interested in having a diverse and nutritious diet, incorporating both proteins and vegetables in their meal. Additionally, the presence of a fork suggests that they prefer using cutlery while eating, which can be seen as a more cultured and organized approach to dining.", "category": "complex"}
70
+ {"question_id": 69, "text": "A white duck is pictured in the image.", "category": "conv"}
71
+ {"question_id": 70, "text": "The image features a single white bird, likely a duck or a goose, walking along a paved road. The bird appears to be closely examining the ground as it walks. The road runs adjacent to a patch of grass, providing a contrasting backdrop for the scene. The bird occupies a predominant part of the image, offering a close-up view of its action.", "category": "detail"}
72
+ {"question_id": 71, "text": "While walking along the paved road, the duck may face various challenges. Firstly, the duck could be at risk of encountering vehicles, which might not notice or expect to see wildlife on the road. This could put the duck in danger of being hit or injured by passing traffic. Secondly, the duck might struggle to find appropriate food sources on or near the paved road, as opposed to its natural habitat, such as a body of water or a grassy area, where it would have easier access to its regular diet.\n\nFurthermore, the duck might be exposed to potential predators, like domestic pets or other animals that live in urban environments, increasing the risk of harm for the duck. Lastly, walking on the paved concrete surface could be less comfortable for the duck than walking on softer natural ground, causing discomfort over time. Overall, the duck's safety and well-being may be compromised by navigating this environment that is not adapted to its natural behaviors and needs.", "category": "complex"}
73
+ {"question_id": 72, "text": "Yes, the little boy is wearing a helmet to ensure his safety while riding his bike.", "category": "conv"}
74
+ {"question_id": 73, "text": "The image depicts a little boy riding his bike on a road. He is practicing safety by wearing a yellow helmet while cycling. The boy is visible from a side angle, capturing both him and his bicycle in motion.", "category": "detail"}
75
+ {"question_id": 74, "text": "The little boy's precaution of wearing a helmet is noteworthy because it shows that the child or their guardian is aware of and strictly adhering to safety precautions. Wearing a helmet while riding a bicycle is crucial for reducing the risk of severe head injuries in the event of a crash, fall, or any other unforeseen incident. It is especially important for children, who are often more prone to accidents due to their limited experience with balancing and negotiating obstacles. By wearing a helmet, the boy is demonstrating responsible behavior and setting a positive example for others to follow, promoting a culture of safety awareness.", "category": "complex"}
76
+ {"question_id": 75, "text": "There are three people visible in the image - two women and one man.", "category": "conv"}
77
+ {"question_id": 76, "text": "The image shows a group of people at a celebration, likely participating in a wine tasting event. There are a total of six people, with three young adults prominently in the foreground. A man wearing glasses stands in the middle, flanked by a couple of women on either side. They are all holding wine glasses, enjoying the event and smiling.\n\nAt the table, there are several wine bottles dispersed along its surface, showing the variety of options available for tasting. Additionally, there is a handbag placed on the table, possibly belonging to one of the women attending the event. \n\nTwo more people are visible in the background. They are smiling and making conversation while engaging in the celebration with the others.", "category": "detail"}
78
+ {"question_id": 77, "text": "It appears that the group of people are gathered for a wine tasting or celebration in which they are enjoying various types of wine. They are seen standing and sitting around a table with several wine bottles and holding wine glasses. Their smiles and the friendly atmosphere indicate that it's a social event where they have gathered to indulge in sampling different wines, engage in conversations, and enjoy each other's company.", "category": "complex"}
79
+ {"question_id": 78, "text": "The main object featured in the image is the clock mounted on a pole or a short tower, standing on the sidewalk next to a building.", "category": "conv"}
80
+ {"question_id": 79, "text": "The image features a quiet street with a tall clock mounted on a pole at the side of the pavement, near a building. The pole stands prominently on the empty sidewalk, offering a convenient time-keeping source for pedestrians.\n\nThere are a few potted plants along the sidewalk, adding a touch of greenery to the scene. Bicycles can also be seen parked nearby, and a couple of cars are parked further down the street. A person is walking in the background, adding some life to the otherwise calm environment.", "category": "detail"}
81
+ {"question_id": 80, "text": "As you walk down this quiet street, you can easily tell the time by looking at the tall clock that is mounted on a pole along the sidewalk. This clock stands out significantly, making it easily noticeable. Regarding the direction of the sidewalk, the presence of potted plants, bicycles, and cars suggest that this is a well-maintained and organized street. As you follow the sidewalk further, you will see that it is lined with trees, potted plants, and parked bicycles and cars, guiding you in the direction it leads. Additionally, the building next to the clock also gives you context for the orientation and direction of the street.", "category": "complex"}
82
+ {"question_id": 81, "text": "The person is riding a motorbike, which can also be referred to as a motorcycle or a moped. In this case, the terms motorcycle, scooter, and moped are used interchangeably to describe the same vehicle the woman is riding.", "category": "conv"}
83
+ {"question_id": 82, "text": "The scene showcases a dirt road, with a woman riding a motorcycle in the middle of the road. She is accompanied by a man walking alongside her, with another man further down the road. Various animals, including three cows, are also present in the scene, suggesting a rural setting. The main cow is positioned to the right of the motorcycle while two smaller cows can be seen on the left side of the road. The people and animals all appear to be going in the same direction down the dirt road.", "category": "detail"}
84
+ {"question_id": 83, "text": "The woman on the motorcycle might face challenges navigating the dirt road, especially because of the presence of multiple objects and living beings on the road. In the image, a cow is nearby, along with two other smaller cows, two men, and a dog. Dirt roads can be uneven, dusty, and unpredictable, making it important for the woman to maneuver cautiously to avoid any potential accidents. Additionally, she needs to be aware of the animals and people around her to ensure both her safety and the safety of others. This situation requires her to be vigilant, slow down when necessary, and carefully navigate through the environment.", "category": "complex"}
85
+ {"question_id": 84, "text": "The skateboard in the image is in an upside-down position, with its wheels pointing up and laying on the ground.", "category": "conv"}
86
+ {"question_id": 85, "text": "In the image, a person wearing camo shorts is standing on a cement surface next to an overturned skateboard. The skateboard is laying upside down with its wheels pointing up, giving a sense of it having been flipped or potentially used for a trick. The boy appears to be standing in front of the skateboard, engaging with it or possibly preparing for his next move.", "category": "detail"}
87
+ {"question_id": 86, "text": "It's plausible that prior to this moment, the person standing next to the overturned skateboard might have attempted a trick or maneuver on the skateboard, which resulted in either a failed attempt or a wipeout. As a consequence, the skateboard ended up upside down on the ground, with its wheels pointed upward. The person, possibly the skateboarder, is standing next to it, possibly pondering their next move, contemplating a retry or assessing any potential damage to the skateboard.", "category": "complex"}
88
+ {"question_id": 87, "text": "The man and woman are wearing umbrella hats, which are designed to provide protection from the rain.", "category": "conv"}
89
+ {"question_id": 88, "text": "The image depicts a man and a woman sitting on top of a bridge, both wearing umbrella hats. The quirky umbrella hats provide them with some cover from the rainy weather. A dog is accompanying the couple, taking a break with them on the bridge. \n\nThe surrounding area has a view of the ocean, and there are boats visible in the water. One boat can be found close to the right edge of the scene, while two other boats are in the middle and the far left side of the frame. Additional umbrellas can be spotted in the area, signifying that other pedestrians have also sought shelter from the rain.", "category": "detail"}
90
+ {"question_id": 89, "text": "On this rainy day, the umbrella hats provide the couple several benefits as they sit on top of a bridge near the ocean with their dog. First, the umbrella hats enable them to keep their hands free, allowing them to hold their dog's leash, interact with each other, and perform other tasks without holding a traditional umbrella. Second, the umbrella hats protect their heads and upper bodies from getting wet due to rain, making their outdoor experience more comfortable. Additionally, the novel and fun nature of the umbrella hats can be a conversation starter and bring some humor to their day despite the gloomy weather.", "category": "complex"}
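The answer file above and the question file added next are keyed by the same question_id values, so they can be paired record for record. A small sketch of joining them, using the repo-relative paths shown in this diff:

import json

def load_jsonl(path):
    with open(path) as f:
        return [json.loads(line) for line in f]

base = "ChatUniVi/eval/questions/coco2014_val_qa_eval/"
answers = {a["question_id"]: a for a in load_jsonl(base + "qa90_gpt4_answer.jsonl")}
for q in load_jsonl(base + "qa90_questions.jsonl"):
    print(q["question_id"], q["category"], q["text"], "->", answers[q["question_id"]]["text"][:60])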
ChatUniVi/eval/questions/coco2014_val_qa_eval/qa90_questions.jsonl ADDED
@@ -0,0 +1,90 @@
1
+ {"question_id": 0, "image": "COCO_val2014_000000441147.jpg", "text": "What is the color of the two suitcases in the image?", "category": "conv"}
2
+ {"question_id": 1, "image": "COCO_val2014_000000441147.jpg", "text": "Analyze the image in a comprehensive and detailed manner.", "category": "detail"}
3
+ {"question_id": 2, "image": "COCO_val2014_000000441147.jpg", "text": "What potential factors could make these suitcases valuable?", "category": "complex"}
4
+ {"question_id": 3, "image": "COCO_val2014_000000353536.jpg", "text": "What are the main objects on the table in the image?", "category": "conv"}
5
+ {"question_id": 4, "image": "COCO_val2014_000000353536.jpg", "text": "Describe the following image.", "category": "detail"}
6
+ {"question_id": 5, "image": "COCO_val2014_000000353536.jpg", "text": "What activity might have recently taken place around this table and what could be its aftermath?", "category": "complex"}
7
+ {"question_id": 6, "image": "COCO_val2014_000000506095.jpg", "text": "What is the main focus of the image?", "category": "conv"}
8
+ {"question_id": 7, "image": "COCO_val2014_000000506095.jpg", "text": "What is this photo about'?", "category": "detail"}
9
+ {"question_id": 8, "image": "COCO_val2014_000000506095.jpg", "text": "What could be a reason for the cat's interest in the laptop?", "category": "complex"}
10
+ {"question_id": 9, "image": "COCO_val2014_000000514915.jpg", "text": "What color is the dog in the image?", "category": "conv"}
11
+ {"question_id": 10, "image": "COCO_val2014_000000514915.jpg", "text": "Describe the following image.", "category": "detail"}
12
+ {"question_id": 11, "image": "COCO_val2014_000000514915.jpg", "text": "Why might the dog have chosen to lay down on the floor in this location?", "category": "complex"}
13
+ {"question_id": 12, "image": "COCO_val2014_000000056013.jpg", "text": "Where is the cart with luggage bags located?", "category": "conv"}
14
+ {"question_id": 13, "image": "COCO_val2014_000000056013.jpg", "text": "Describe the following image.", "category": "detail"}
15
+ {"question_id": 14, "image": "COCO_val2014_000000056013.jpg", "text": "What challenges might the staff encounter with handling luggage in the lobby?", "category": "complex"}
16
+ {"question_id": 15, "image": "COCO_val2014_000000408439.jpg", "text": "What is the body of water seen in the image?", "category": "conv"}
17
+ {"question_id": 16, "image": "COCO_val2014_000000408439.jpg", "text": "Can you elaborate on the elements of the picture provided?", "category": "detail"}
18
+ {"question_id": 17, "image": "COCO_val2014_000000408439.jpg", "text": "What are potential reasons for the train to be traveling in such a scenic route?", "category": "complex"}
19
+ {"question_id": 18, "image": "COCO_val2014_000000164255.jpg", "text": "What is the main object in the image?", "category": "conv"}
20
+ {"question_id": 19, "image": "COCO_val2014_000000164255.jpg", "text": "What do you think is going on in this snapshot?", "category": "detail"}
21
+ {"question_id": 20, "image": "COCO_val2014_000000164255.jpg", "text": "What purpose does the umbrella serve on the beach?", "category": "complex"}
22
+ {"question_id": 21, "image": "COCO_val2014_000000034096.jpg", "text": "What stage is the house in regarding its construction?", "category": "conv"}
23
+ {"question_id": 22, "image": "COCO_val2014_000000034096.jpg", "text": "What is this photo about'?", "category": "detail"}
24
+ {"question_id": 23, "image": "COCO_val2014_000000034096.jpg", "text": "What may be a potential obstacle for the house construction in its current state?", "category": "complex"}
25
+ {"question_id": 24, "image": "COCO_val2014_000000385873.jpg", "text": "How many pizzas are there in the image?", "category": "conv"}
26
+ {"question_id": 25, "image": "COCO_val2014_000000385873.jpg", "text": "Explain the visual content of the image in great detail.", "category": "detail"}
27
+ {"question_id": 26, "image": "COCO_val2014_000000385873.jpg", "text": "What are the possible challenges one might face when dividing these pizzas equitably among a group of people?", "category": "complex"}
28
+ {"question_id": 27, "image": "COCO_val2014_000000367571.jpg", "text": "How many doughnuts are in the box?", "category": "conv"}
29
+ {"question_id": 28, "image": "COCO_val2014_000000367571.jpg", "text": "Write a detailed description of the given image.", "category": "detail"}
30
+ {"question_id": 29, "image": "COCO_val2014_000000367571.jpg", "text": "What unique taste combinations can be found in this box of doughnuts?", "category": "complex"}
31
+ {"question_id": 30, "image": "COCO_val2014_000000319432.jpg", "text": "What is the man doing in the image?", "category": "conv"}
32
+ {"question_id": 31, "image": "COCO_val2014_000000319432.jpg", "text": "Can you describe the main features of this image for me?", "category": "detail"}
33
+ {"question_id": 32, "image": "COCO_val2014_000000319432.jpg", "text": "Why might the man be holding the clothes on hangers?", "category": "complex"}
34
+ {"question_id": 33, "image": "COCO_val2014_000000225738.jpg", "text": "How many giraffes are depicted in the image?", "category": "conv"}
35
+ {"question_id": 34, "image": "COCO_val2014_000000225738.jpg", "text": "Write a detailed description of the given image.", "category": "detail"}
36
+ {"question_id": 35, "image": "COCO_val2014_000000225738.jpg", "text": "Why might these giraffes be gathering near the same tree?", "category": "complex"}
37
+ {"question_id": 36, "image": "COCO_val2014_000000092109.jpg", "text": "What is the main focus of the image?", "category": "conv"}
38
+ {"question_id": 37, "image": "COCO_val2014_000000092109.jpg", "text": "What is this photo about'?", "category": "detail"}
39
+ {"question_id": 38, "image": "COCO_val2014_000000092109.jpg", "text": "What can be inferred about the giraffe's habitat from this image?", "category": "complex"}
40
+ {"question_id": 39, "image": "COCO_val2014_000000506483.jpg", "text": "What kind of objects are included in the art installation?", "category": "conv"}
41
+ {"question_id": 40, "image": "COCO_val2014_000000506483.jpg", "text": "Explain the visual content of the image in great detail.", "category": "detail"}
42
+ {"question_id": 41, "image": "COCO_val2014_000000506483.jpg", "text": "How might the art installation encourage interaction among people?", "category": "complex"}
43
+ {"question_id": 42, "image": "COCO_val2014_000000097131.jpg", "text": "What is the main focus of the image?", "category": "conv"}
44
+ {"question_id": 43, "image": "COCO_val2014_000000097131.jpg", "text": "Can you elaborate on the elements of the picture provided?", "category": "detail"}
45
+ {"question_id": 44, "image": "COCO_val2014_000000097131.jpg", "text": "What might be the reason for the car to be parked on the side of the road?", "category": "complex"}
46
+ {"question_id": 45, "image": "COCO_val2014_000000052312.jpg", "text": "What is unusual about the elderly man's facial appearance in the image?", "category": "conv"}
47
+ {"question_id": 46, "image": "COCO_val2014_000000052312.jpg", "text": "What's happening in the scene?", "category": "detail"}
48
+ {"question_id": 47, "image": "COCO_val2014_000000052312.jpg", "text": "What message could be conveyed by the man's unusual facial adornment?", "category": "complex"}
49
+ {"question_id": 48, "image": "COCO_val2014_000000258285.jpg", "text": "How many airplanes are visible in the image?", "category": "conv"}
50
+ {"question_id": 49, "image": "COCO_val2014_000000258285.jpg", "text": "What do you see happening in this image?", "category": "detail"}
51
+ {"question_id": 50, "image": "COCO_val2014_000000258285.jpg", "text": "What environmental impact might these planes' activity have on the birds?", "category": "complex"}
52
+ {"question_id": 51, "image": "COCO_val2014_000000081552.jpg", "text": "What is the cat doing on the red couch?", "category": "conv"}
53
+ {"question_id": 52, "image": "COCO_val2014_000000081552.jpg", "text": "Write a detailed description of the given image.", "category": "detail"}
54
+ {"question_id": 53, "image": "COCO_val2014_000000081552.jpg", "text": "What factors may have led the cat to choose this location for sleep?", "category": "complex"}
55
+ {"question_id": 54, "image": "COCO_val2014_000000066144.jpg", "text": "What type of car is parked next to the stop sign in the image?", "category": "conv"}
56
+ {"question_id": 55, "image": "COCO_val2014_000000066144.jpg", "text": "Explain the visual content of the image in great detail.", "category": "detail"}
57
+ {"question_id": 56, "image": "COCO_val2014_000000066144.jpg", "text": "What can be inferred about the traffic situation in the area?", "category": "complex"}
58
+ {"question_id": 57, "image": "COCO_val2014_000000203629.jpg", "text": "What is the man doing in the image?", "category": "conv"}
59
+ {"question_id": 58, "image": "COCO_val2014_000000203629.jpg", "text": "Can you describe the main features of this image for me?", "category": "detail"}
60
+ {"question_id": 59, "image": "COCO_val2014_000000203629.jpg", "text": "What might be the reason behind the man's thumbs-up gesture while on the phone?", "category": "complex"}
61
+ {"question_id": 60, "image": "COCO_val2014_000000109532.jpg", "text": "What breed is the dog in the image?", "category": "conv"}
62
+ {"question_id": 61, "image": "COCO_val2014_000000109532.jpg", "text": "What are the key elements in this picture?", "category": "detail"}
63
+ {"question_id": 62, "image": "COCO_val2014_000000109532.jpg", "text": "What measures may be taken by the dog's owner to make the dog feel comfortable in the room?", "category": "complex"}
64
+ {"question_id": 63, "image": "COCO_val2014_000000431165.jpg", "text": "What is the color of the elephant in the image?", "category": "conv"}
65
+ {"question_id": 64, "image": "COCO_val2014_000000431165.jpg", "text": "What do you think is going on in this snapshot?", "category": "detail"}
66
+ {"question_id": 65, "image": "COCO_val2014_000000431165.jpg", "text": "What could be a potential concern for the baby elephant in this setting?", "category": "complex"}
67
+ {"question_id": 66, "image": "COCO_val2014_000000020650.jpg", "text": "What type of food items are present on the orange plate?", "category": "conv"}
68
+ {"question_id": 67, "image": "COCO_val2014_000000020650.jpg", "text": "What are the key elements in this picture?", "category": "detail"}
69
+ {"question_id": 68, "image": "COCO_val2014_000000020650.jpg", "text": "What can be inferred about the individual's dietary choices?", "category": "complex"}
70
+ {"question_id": 69, "image": "COCO_val2014_000000205183.jpg", "text": "What type of bird is pictured in the image?", "category": "conv"}
71
+ {"question_id": 70, "image": "COCO_val2014_000000205183.jpg", "text": "What do you see happening in this image?", "category": "detail"}
72
+ {"question_id": 71, "image": "COCO_val2014_000000205183.jpg", "text": "What challenges could the duck face while walking along the paved road?", "category": "complex"}
73
+ {"question_id": 72, "image": "COCO_val2014_000000210299.jpg", "text": "Is the little boy wearing any safety gear while riding his bike?", "category": "conv"}
74
+ {"question_id": 73, "image": "COCO_val2014_000000210299.jpg", "text": "Can you elaborate on the elements of the picture provided?", "category": "detail"}
75
+ {"question_id": 74, "image": "COCO_val2014_000000210299.jpg", "text": "Why is the little boy's precaution noteworthy?", "category": "complex"}
76
+ {"question_id": 75, "image": "COCO_val2014_000000515716.jpg", "text": "How many people are visible in the image?", "category": "conv"}
77
+ {"question_id": 76, "image": "COCO_val2014_000000515716.jpg", "text": "What do you see happening in this image?", "category": "detail"}
78
+ {"question_id": 77, "image": "COCO_val2014_000000515716.jpg", "text": "What might be the reason behind this gathering?", "category": "complex"}
79
+ {"question_id": 78, "image": "COCO_val2014_000000460149.jpg", "text": "What is the main object featured in the image?", "category": "conv"}
80
+ {"question_id": 79, "image": "COCO_val2014_000000460149.jpg", "text": "Describe the following image.", "category": "detail"}
81
+ {"question_id": 80, "image": "COCO_val2014_000000460149.jpg", "text": "Imagine you are walking down this street; how would you know the time or which direction the sidewalk leads?", "category": "complex"}
82
+ {"question_id": 81, "image": "COCO_val2014_000000293505.jpg", "text": "Is the person riding a motorbike or scooter?", "category": "conv"}
83
+ {"question_id": 82, "image": "COCO_val2014_000000293505.jpg", "text": "Explain the visual content of the image in great detail.", "category": "detail"}
84
+ {"question_id": 83, "image": "COCO_val2014_000000293505.jpg", "text": "What challenges might the woman on the motorcycle face in this situation?", "category": "complex"}
85
+ {"question_id": 84, "image": "COCO_val2014_000000525439.jpg", "text": "What is the position of the skateboard in the image?", "category": "conv"}
86
+ {"question_id": 85, "image": "COCO_val2014_000000525439.jpg", "text": "Can you elaborate on the elements of the picture provided?", "category": "detail"}
87
+ {"question_id": 86, "image": "COCO_val2014_000000525439.jpg", "text": "What might have happened prior to this moment?", "category": "complex"}
88
+ {"question_id": 87, "image": "COCO_val2014_000000534270.jpg", "text": "What type of hats are the man and woman wearing?", "category": "conv"}
89
+ {"question_id": 88, "image": "COCO_val2014_000000534270.jpg", "text": "Can you describe the main features of this image for me?", "category": "detail"}
90
+ {"question_id": 89, "image": "COCO_val2014_000000534270.jpg", "text": "What benefits do umbrella hats provide to the couple on this rainy day?", "category": "complex"}
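Both qa90 files above are JSON Lines: one record per line, keyed by question_id, with a category of "conv", "detail", or "complex". As a minimal sketch (the file paths come from this folder; the pairing logic itself is an illustrative assumption, not code from the repo), the questions can be joined to their GPT-4 reference answers like this:

import json

def load_jsonl(path):
    # One JSON object per line, as in the qa90 files shown above.
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]

questions = load_jsonl("ChatUniVi/eval/questions/coco2014_val_qa_eval/qa90_questions.jsonl")
answers = load_jsonl("ChatUniVi/eval/questions/coco2014_val_qa_eval/qa90_gpt4_answer.jsonl")

# Each question record carries question_id, image, text and category;
# each answer record carries question_id, text and category.
answers_by_id = {a["question_id"]: a for a in answers}
for q in questions:
    reference = answers_by_id[q["question_id"]]["text"]
    # "reference" is the GPT-4 answer used as the comparison target for model outputs.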
ChatUniVi/eval/questions/coco_pope/coco_pope_adversarial.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
ChatUniVi/eval/questions/coco_pope/coco_pope_popular.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
ChatUniVi/eval/questions/coco_pope/coco_pope_random.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
ChatUniVi/eval/questions/scienceqa/pid_splits.json ADDED
The diff for this file is too large to render. See raw diff
 
ChatUniVi/eval/questions/scienceqa/problems.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d9b598da966d9736dd79e430a97da861a2216aeb7483a5092350e823ab20ce7
3
+ size 31529211
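scienceqa/problems.json is stored as a Git LFS pointer rather than inline content: the three lines above give the pointer spec version, the SHA-256 object id and the payload size in bytes (about 31.5 MB). A minimal sketch for reading such a pointer file (assuming the plain-text format shown above; this helper is illustrative and not part of the repo):

def read_lfs_pointer(path):
    # Parses "key value" lines such as version, oid and size.
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

pointer = read_lfs_pointer("ChatUniVi/eval/questions/scienceqa/problems.json")
# pointer["oid"] -> "sha256:4d9b...", pointer["size"] -> "31529211"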
ChatUniVi/eval/questions/scienceqa/test_QCM-LEA.json ADDED
The diff for this file is too large to render. See raw diff
 
ChatUniVi/eval/questions/video_qa/activitynet_a_list.json ADDED
@@ -0,0 +1,2105 @@
1
+ [
2
+ "no",
3
+ "yes",
4
+ "day",
5
+ "outdoor",
6
+ "good looking",
7
+ "bit dangerous",
8
+ "secondary",
9
+ "simple",
10
+ "much simpler",
11
+ "more difficult",
12
+ "difficulty",
13
+ "low",
14
+ "unsafe",
15
+ "sunny",
16
+ "cloudy",
17
+ "2",
18
+ "1",
19
+ "5",
20
+ "3",
21
+ "4",
22
+ "6",
23
+ "11",
24
+ "7",
25
+ "8",
26
+ "female",
27
+ "male",
28
+ "indoor",
29
+ "shopkeepers and customers",
30
+ "friend",
31
+ "brother",
32
+ "bath towel",
33
+ "tie rope",
34
+ "combat training",
35
+ "play football on table",
36
+ "chipping potatoes",
37
+ "dishwashing",
38
+ "skateboard with water",
39
+ "defoliation",
40
+ "brush ones teeth",
41
+ "baton tennis",
42
+ "play ball",
43
+ "performing martial arts",
44
+ "play",
45
+ "dance",
46
+ "gargle",
47
+ "slider",
48
+ "wine dressing",
49
+ "rowing",
50
+ "pacify horse",
51
+ "drum",
52
+ "cook dish",
53
+ "do floor exercises",
54
+ "match",
55
+ "gymnastics in parallel bars",
56
+ "be referee",
57
+ "dishware",
58
+ "skin care",
59
+ "train",
60
+ "hammer throw",
61
+ "draw eyes shadow",
62
+ "wash horse",
63
+ "car wash",
64
+ "play musical instrument",
65
+ "kayaking",
66
+ "skiing",
67
+ "make cake",
68
+ "sweep snow for cars",
69
+ "paracentesis",
70
+ "drink",
71
+ "practice martial arts",
72
+ "make cold dishes",
73
+ "contain water",
74
+ "put shot",
75
+ "bathe horse",
76
+ "making cocktails",
77
+ "clean room",
78
+ "wipe glass",
79
+ "shovel snow",
80
+ "make spaghetti",
81
+ "dig up soil",
82
+ "cut tomato",
83
+ "tattoo",
84
+ "boxing",
85
+ "bodybuilding",
86
+ "sailing competition",
87
+ "hopping hula hoop",
88
+ "somersault",
89
+ "paint fingernails",
90
+ "make sandwich",
91
+ "introduce tool",
92
+ "walk dog",
93
+ "make ice cream",
94
+ "manicure",
95
+ "make hair",
96
+ "brushing horse",
97
+ "wrapping paper",
98
+ "play game",
99
+ "wear shoe",
100
+ "play with sand",
101
+ "chopping wood",
102
+ "smear sunscreen",
103
+ "weightlifting",
104
+ "do some washing",
105
+ "assemble bicycle",
106
+ "bathe dog",
107
+ "shoe polish",
108
+ "fire",
109
+ "sharpening",
110
+ "wear contact lenses",
111
+ "parade",
112
+ "brush shoes",
113
+ "cleaning deciduous leaves",
114
+ "wash clothes",
115
+ "play tennis",
116
+ "clink with dog",
117
+ "tires dismantling",
118
+ "snow removal",
119
+ "squeezed juice",
120
+ "play table tennis",
121
+ "run",
122
+ "diving",
123
+ "decorating christmas tree",
124
+ "shooting",
125
+ "pruning branches",
126
+ "javelin throw",
127
+ "shuttlecock",
128
+ "wallpaper",
129
+ "practice stick method",
130
+ "control remote control ship",
131
+ "assembling tires",
132
+ "sweeping snow",
133
+ "sailing",
134
+ "cut nails for cat",
135
+ "assembly slate",
136
+ "push ice pot",
137
+ "physical exercise",
138
+ "assemble christmas tree",
139
+ "hair removal",
140
+ "surfing",
141
+ "making lollipops",
142
+ "bungee jumping",
143
+ "weed",
144
+ "handcraft",
145
+ "clean glass",
146
+ "loading chain",
147
+ "play magic cube",
148
+ "landscaping",
149
+ "play volleyball",
150
+ "play harmonica",
151
+ "cooking",
152
+ "shave legs",
153
+ "taking something",
154
+ "perform",
155
+ "drinking",
156
+ "play weeding machine",
157
+ "clean desk",
158
+ "make drink",
159
+ "peeling sweet potatoes",
160
+ "licensing",
161
+ "pole vault",
162
+ "bowling",
163
+ "skip lattices",
164
+ "introduction of lawn mower",
165
+ "hula hoop",
166
+ "play bounce shoes",
167
+ "yoga teaching",
168
+ "throwing discus",
169
+ "balance beam",
170
+ "flute",
171
+ "fly kite",
172
+ "leaping",
173
+ "finger guessing game",
174
+ "blow sax",
175
+ "comb ones hair",
176
+ "make face",
177
+ "weave hair",
178
+ "chat",
179
+ "sunscreen",
180
+ "photograph",
181
+ "trailing ground",
182
+ "iron clothes",
183
+ "chipping",
184
+ "ride camel",
185
+ "demonstrating",
186
+ "riding",
187
+ "make light",
188
+ "paint fence",
189
+ "making sandwiches",
190
+ "discus throwing",
191
+ "gymnastics on horizontal bar",
192
+ "water gliding",
193
+ "shot put",
194
+ "kicking sand",
195
+ "fertilizer application",
196
+ "strap rope skipping",
197
+ "play baseball",
198
+ "interview",
199
+ "sit on ground",
200
+ "be pulled forward",
201
+ "shave",
202
+ "teaching",
203
+ "play computer games",
204
+ "play football",
205
+ "teaching martial arts",
206
+ "play jumper",
207
+ "help to cut hair",
208
+ "table tennis",
209
+ "play bumper car",
210
+ "skateboard",
211
+ "bike ride",
212
+ "paddle",
213
+ "tire repair",
214
+ "long jump",
215
+ "swimming",
216
+ "shine shoes",
217
+ "broadcast news",
218
+ "scavenging leaves",
219
+ "sail",
220
+ "training dog",
221
+ "use of vacuum cleaner",
222
+ "wrist wrestling",
223
+ "blow ones hair",
224
+ "field shooting",
225
+ "ride on bicycle",
226
+ "rock climbing",
227
+ "play slide",
228
+ "high jump",
229
+ "drive car",
230
+ "brush wall",
231
+ "skateboarding",
232
+ "slippery pulley",
233
+ "playing harmonica",
234
+ "cook",
235
+ "play beach volleyball",
236
+ "explain ice hockey",
237
+ "pray",
238
+ "play basketball",
239
+ "tug of war",
240
+ "dismantling gifts",
241
+ "football match",
242
+ "laying tile",
243
+ "explain",
244
+ "play with smarthphone",
245
+ "hair care",
246
+ "spruce up",
247
+ "gift of gift",
248
+ "throwing darts",
249
+ "play tai chi",
250
+ "play parallel bars",
251
+ "turn magic cube",
252
+ "draw",
253
+ "watch tv",
254
+ "dog",
255
+ "dog and cat",
256
+ "competition archery",
257
+ "throwing javelin",
258
+ "bicycle race",
259
+ "throw hammer",
260
+ "throwing discuses",
261
+ "training",
262
+ "mountain bike ride",
263
+ "ice curling",
264
+ "exercise gymnastics",
265
+ "water polo",
266
+ "turn somersault",
267
+ "gymnastics",
268
+ "throw ball",
269
+ "wrestling",
270
+ "enhanced collaboration",
271
+ "black",
272
+ "yellow",
273
+ "dodgeball",
274
+ "eat ice cream",
275
+ "haystack",
276
+ "kindling wood",
277
+ "smoking",
278
+ "tree planting",
279
+ "curly",
280
+ "make skis",
281
+ "play violin",
282
+ "grooming",
283
+ "shoes polish",
284
+ "cut persons head",
285
+ "by cable car",
286
+ "wire walking",
287
+ "do yoga",
288
+ "masonry wall",
289
+ "clean dog",
290
+ "push car",
291
+ "rope skipping",
292
+ "leather shoes",
293
+ "measurement length",
294
+ "car repair tires",
295
+ "pruning trees",
296
+ "tread on stilts",
297
+ "paint chair",
298
+ "tilling",
299
+ "dance hip hop",
300
+ "drink beer",
301
+ "dive to feed fish",
302
+ "windsurfing",
303
+ "boxing practice",
304
+ "haircut",
305
+ "ride bicycle",
306
+ "do shape packing",
307
+ "play piano",
308
+ "play hockey",
309
+ "eat something",
310
+ "rubbing feet",
311
+ "sleep",
312
+ "play with phone",
313
+ "light roof",
314
+ "ride bike",
315
+ "wash hands",
316
+ "play guitar",
317
+ "shoe rope",
318
+ "throw spear",
319
+ "fight",
320
+ "play music",
321
+ "prepare for dive",
322
+ "singing",
323
+ "tread",
324
+ "face",
325
+ "hockey game",
326
+ "helmet",
327
+ "hat",
328
+ "play billiards",
329
+ "knit sweater",
330
+ "make salad",
331
+ "use stick",
332
+ "skiing leather ring",
333
+ "light cigarette",
334
+ "tree pruning",
335
+ "tire change",
336
+ "performing gymnastics",
337
+ "play roller skating",
338
+ "wheel change",
339
+ "dance tai chi sword",
340
+ "wrists",
341
+ "be manicure",
342
+ "make sandwich biscuits",
343
+ "reaction capacity",
344
+ "gray",
345
+ "blue",
346
+ "green",
347
+ "red",
348
+ "white",
349
+ "black and white",
350
+ "orange",
351
+ "pink",
352
+ "purple",
353
+ "brown",
354
+ "golden",
355
+ "dynamic bicycle",
356
+ "white glove",
357
+ "glove",
358
+ "rest",
359
+ "lying on ground",
360
+ "introduce people",
361
+ "motorcycle jump",
362
+ "jump up",
363
+ "take cell phone",
364
+ "play with mud",
365
+ "and then drums",
366
+ "audience left",
367
+ "audience came to court",
368
+ "raise hand to celebrate",
369
+ "cheer",
370
+ "throw snowball",
371
+ "to ground",
372
+ "video end",
373
+ "audience applauded",
374
+ "then shovel snow",
375
+ "jump into sand pit",
376
+ "talk to passer by",
377
+ "run up",
378
+ "serve",
379
+ "take ball",
380
+ "go on court",
381
+ "introduction of products",
382
+ "drink drink",
383
+ "clean horse",
384
+ "lens shaky",
385
+ "end of demonstration",
386
+ "wash hair",
387
+ "start",
388
+ "add ice",
389
+ "prepare for start",
390
+ "shedding rope",
391
+ "check rope",
392
+ "take skates",
393
+ "get ready",
394
+ "coach guidance",
395
+ "referee throws the ball",
396
+ "cat peppermint",
397
+ "prepare starting",
398
+ "shoulder shoulder",
399
+ "showcase wine",
400
+ "blower",
401
+ "blow up drum",
402
+ "squat",
403
+ "almost hit by car",
404
+ "production process",
405
+ "button button",
406
+ "be interviewed",
407
+ "go out",
408
+ "down",
409
+ "all down",
410
+ "christmas tree fell down",
411
+ "hole",
412
+ "diver",
413
+ "plain white cloth",
414
+ "national flag",
415
+ "hair drier",
416
+ "white woman",
417
+ "wall column",
418
+ "visual acuity chart",
419
+ "shovel",
420
+ "wall",
421
+ "sofa",
422
+ "living room",
423
+ "tree",
424
+ "grassland",
425
+ "iron block",
426
+ "house",
427
+ "automobile",
428
+ "mirror",
429
+ "ground",
430
+ "billboard",
431
+ "enclosure",
432
+ "television",
433
+ "window",
434
+ "blue plastic film",
435
+ "hair stylist",
436
+ "wooden doors",
437
+ "small house",
438
+ "table",
439
+ "shelf",
440
+ "coloured lights",
441
+ "small window",
442
+ "fountain",
443
+ "audience",
444
+ "glass",
445
+ "blue screen",
446
+ "spectator seats",
447
+ "guardrail net",
448
+ "screen",
449
+ "cabinet",
450
+ "kitchen",
451
+ "wood",
452
+ "door",
453
+ "vehicle",
454
+ "christmas tree",
455
+ "chair",
456
+ "yellow bag",
457
+ "glass cabinet",
458
+ "two pieces of exhibition board",
459
+ "fish",
460
+ "goods shelves",
461
+ "treadmill",
462
+ "window curtains",
463
+ "snow mountain",
464
+ "mountain",
465
+ "car",
466
+ "stand",
467
+ "practice basketball",
468
+ "experimentalize",
469
+ "guitar",
470
+ "table football",
471
+ "bread",
472
+ "rope",
473
+ "men in white",
474
+ "microphone",
475
+ "poker",
476
+ "computer",
477
+ "disk",
478
+ "judges",
479
+ "ball net",
480
+ "drum kit",
481
+ "cobra",
482
+ "ball table",
483
+ "stone",
484
+ "sand",
485
+ "shoal of fish",
486
+ "white coat man",
487
+ "pool",
488
+ "road",
489
+ "water",
490
+ "tennis table",
491
+ "railing",
492
+ "tea table",
493
+ "cat",
494
+ "flower receptacle",
495
+ "fire pile",
496
+ "men in blue",
497
+ "cake",
498
+ "bowling pin",
499
+ "floor",
500
+ "hillside",
501
+ "white man",
502
+ "ironing table",
503
+ "basketball box",
504
+ "man in blue",
505
+ "table tennis table",
506
+ "food",
507
+ "network",
508
+ "red clothes",
509
+ "cruise",
510
+ "light pole",
511
+ "sound",
512
+ "agitation machine",
513
+ "man in camouflage",
514
+ "kitchen sink",
515
+ "two cattles",
516
+ "court",
517
+ "football field",
518
+ "man in white",
519
+ "wine bottle",
520
+ "basket",
521
+ "piano",
522
+ "beer",
523
+ "timer",
524
+ "counter",
525
+ "wash basin",
526
+ "bottle",
527
+ "ball",
528
+ "wrist watch",
529
+ "trash",
530
+ "steel pipe",
531
+ "model",
532
+ "row of tires",
533
+ "bunker",
534
+ "switch",
535
+ "stool",
536
+ "swimming pool",
537
+ "bracelet",
538
+ "spoon",
539
+ "milk",
540
+ "knife",
541
+ "surround masses",
542
+ "auditorium",
543
+ "silverware cadets",
544
+ "piano frame",
545
+ "fence",
546
+ "man in a suit",
547
+ "table and chair",
548
+ "horse",
549
+ "guardrail",
550
+ "referee",
551
+ "glass table",
552
+ "sea",
553
+ "wine glass",
554
+ "material science",
555
+ "seasoning",
556
+ "card",
557
+ "ice cream",
558
+ "tool",
559
+ "bowl",
560
+ "down by sally gardens",
561
+ "spray painting",
562
+ "club signs",
563
+ "help man shave",
564
+ "relax",
565
+ "decompression",
566
+ "physical fitness",
567
+ "exercise muscles",
568
+ "relieve stress",
569
+ "laugh",
570
+ "put on shoes",
571
+ "lingual nail",
572
+ "cut roof tin",
573
+ "gymnastic mat",
574
+ "siberian husky",
575
+ "poodle",
576
+ "cooking noodles",
577
+ "make egg roll",
578
+ "collar exercises",
579
+ "transparent",
580
+ "much much safer",
581
+ "much higher",
582
+ "christmas",
583
+ "s",
584
+ "e",
585
+ "exercise bike",
586
+ "skate",
587
+ "lawn",
588
+ "cement floor",
589
+ "horizontal bar",
590
+ "surf board",
591
+ "circular site",
592
+ "woman in white",
593
+ "water basin",
594
+ "trampoline",
595
+ "rim",
596
+ "volleyball court",
597
+ "blue dress man",
598
+ "garden",
599
+ "carrot",
600
+ "pads",
601
+ "vacuum cleaner",
602
+ "cooking bench",
603
+ "notes",
604
+ "juicer",
605
+ "blanket",
606
+ "bathing pool",
607
+ "curling runway",
608
+ "wheel",
609
+ "bar counter",
610
+ "protective net",
611
+ "opponent",
612
+ "contestant",
613
+ "hopping viaduct",
614
+ "pool table",
615
+ "curly hair",
616
+ "pet dog",
617
+ "hairstylist",
618
+ "coach",
619
+ "nail polish",
620
+ "make up",
621
+ "take exercise",
622
+ "play darts",
623
+ "water spray",
624
+ "tell about",
625
+ "phone",
626
+ "gymnastics competition",
627
+ "show turns eye",
628
+ "guard net",
629
+ "ski",
630
+ "drawing board",
631
+ "welding",
632
+ "buoy",
633
+ "watch",
634
+ "hair hoop",
635
+ "headset",
636
+ "flying discses",
637
+ "gambling",
638
+ "explain games rule",
639
+ "high",
640
+ "security",
641
+ "danger",
642
+ "volleyball",
643
+ "long distance run",
644
+ "twenty one points",
645
+ "tennis sports",
646
+ "ice hockey movement",
647
+ "parallel bars movement",
648
+ "three step jump",
649
+ "ice hockey",
650
+ "705",
651
+ "mouthpiece",
652
+ "budweiser",
653
+ "wrench",
654
+ "ceiling",
655
+ "sky",
656
+ "parallel bars exercise",
657
+ "boxing match",
658
+ "wiping cupboard",
659
+ "seal carpets",
660
+ "poweriser",
661
+ "hair rope",
662
+ "wash socks",
663
+ "toilet paper",
664
+ "braided hair",
665
+ "pastry master",
666
+ "charlie adams",
667
+ "decorator",
668
+ "colleague",
669
+ "partner",
670
+ "master and servant",
671
+ "mother and daughter",
672
+ "teammate",
673
+ "whole family",
674
+ "family",
675
+ "spouse",
676
+ "smoke friend",
677
+ "doctor patient relationship",
678
+ "teaching relationship",
679
+ "father and son",
680
+ "father and daughter",
681
+ "lovers",
682
+ "classmate",
683
+ "dancing partner",
684
+ "video player",
685
+ "row of fences",
686
+ "lower",
687
+ "low grade",
688
+ "higher",
689
+ "much more dangerous",
690
+ "bit safe",
691
+ "protect head",
692
+ "commonly",
693
+ "winter",
694
+ "circular",
695
+ "cylinder",
696
+ "trapezoid",
697
+ "square",
698
+ "rectangle",
699
+ "t shirt",
700
+ "long",
701
+ "snow",
702
+ "roof",
703
+ "runway",
704
+ "introduction equipment",
705
+ "clear",
706
+ "preside over program",
707
+ "hand drums",
708
+ "fitness equipment",
709
+ "archery",
710
+ "relatives",
711
+ "in repair",
712
+ "practice ball",
713
+ "make biscuits",
714
+ "how to wear contact lenses",
715
+ "introduction of cutting tools",
716
+ "how to set up cars",
717
+ "billiard game",
718
+ "rules of playing cards",
719
+ "food program",
720
+ "yoga mat",
721
+ "coaster",
722
+ "board",
723
+ "paper",
724
+ "lake",
725
+ "stage",
726
+ "inverted reflection in water",
727
+ "rubber runway",
728
+ "hovercraft",
729
+ "necklace",
730
+ "labrador",
731
+ "short",
732
+ "blue jeans",
733
+ "jeans",
734
+ "jump",
735
+ "jumping project",
736
+ "gym shoes",
737
+ "boots",
738
+ "artistic gymnastics",
739
+ "night",
740
+ "bar",
741
+ "home",
742
+ "basketball court",
743
+ "gym",
744
+ "hockey field",
745
+ "suburb",
746
+ "barber shop",
747
+ "beach",
748
+ "natatorium",
749
+ "casino",
750
+ "in street",
751
+ "snow land",
752
+ "bedroom",
753
+ "grass",
754
+ "parking lot",
755
+ "toilet",
756
+ "studio",
757
+ "park",
758
+ "hall",
759
+ "dancing room",
760
+ "playground",
761
+ "street",
762
+ "hotel",
763
+ "tattoo shop",
764
+ "shower room",
765
+ "sandy beach",
766
+ "room",
767
+ "rooftop",
768
+ "cave",
769
+ "stadium",
770
+ "plain",
771
+ "desk",
772
+ "ski field",
773
+ "tennis court",
774
+ "baseball field",
775
+ "lakeside",
776
+ "laboratory",
777
+ "church",
778
+ "amusement park",
779
+ "racetrack",
780
+ "horse yard",
781
+ "training ground",
782
+ "supermarket",
783
+ "office",
784
+ "market",
785
+ "tennis room",
786
+ "bowling room",
787
+ "manicure shop",
788
+ "game place",
789
+ "seaside",
790
+ "aquatic",
791
+ "chess and card room",
792
+ "left",
793
+ "left hand",
794
+ "black man",
795
+ "stimulate",
796
+ "reduce drag",
797
+ "give yourself inertia",
798
+ "increase jump distance",
799
+ "poor balance",
800
+ "for beauty",
801
+ "avoid injury",
802
+ "for sake of beauty",
803
+ "beautiful",
804
+ "anti damage",
805
+ "avoid hurt somenon",
806
+ "beauty",
807
+ "keep health",
808
+ "entertainment",
809
+ "convenient",
810
+ "wonderful performance",
811
+ "prevent drowning",
812
+ "woman",
813
+ "cylindrical",
814
+ "triangle",
815
+ "ellipse",
816
+ "round",
817
+ "dragon boat",
818
+ "hard to catch fire",
819
+ "2 people",
820
+ "skiing air cushion",
821
+ "sedan",
822
+ "good",
823
+ "wooden house",
824
+ "scuba diving",
825
+ "sunglasses",
826
+ "drifting",
827
+ "braid of double twist",
828
+ "out of flatness",
829
+ "slipper",
830
+ "stripe",
831
+ "suit",
832
+ "lattice",
833
+ "cleanser",
834
+ "cutting machine",
835
+ "air conditioner",
836
+ "two little funny birds",
837
+ "sapling",
838
+ "blonde",
839
+ "spider man toy",
840
+ "vest",
841
+ "tango",
842
+ "villa",
843
+ "wooden boat",
844
+ "springboard",
845
+ "slide",
846
+ "cardboard",
847
+ "protective pad",
848
+ "tigers head",
849
+ "plastic bag",
850
+ "onion",
851
+ "introduction of bullfighting",
852
+ "painting",
853
+ "game",
854
+ "take step",
855
+ "do aerobics",
856
+ "learn diving",
857
+ "soup spoon",
858
+ "english",
859
+ "banana",
860
+ "sax",
861
+ "workbench",
862
+ "drifting air cushion",
863
+ "pedal",
864
+ "bridge frame",
865
+ "sponge pad",
866
+ "child seat",
867
+ "bicycle",
868
+ "earth",
869
+ "seaweed",
870
+ "sealing layer",
871
+ "marble",
872
+ "bed",
873
+ "lifting jack",
874
+ "brand",
875
+ "sailboat",
876
+ "ponytail",
877
+ "dress",
878
+ "take deep breath",
879
+ "pommeled horse",
880
+ "shoulder hair",
881
+ "butterfly",
882
+ "peaked cap",
883
+ "sweet canister",
884
+ "new type lawn mower",
885
+ "fixator",
886
+ "scientific research ship",
887
+ "sandals",
888
+ "leather boots",
889
+ "blue shorts",
890
+ "funny video",
891
+ "b",
892
+ "pvc material",
893
+ "roast fish",
894
+ "basketball match",
895
+ "sitting",
896
+ "summer",
897
+ "cheerleading",
898
+ "spider",
899
+ "evening",
900
+ "afternoon",
901
+ "axe",
902
+ "movement type",
903
+ "racing moto",
904
+ "remote control boat",
905
+ "2012",
906
+ "halloween",
907
+ "casinos",
908
+ "carpet",
909
+ "track",
910
+ "field",
911
+ "park path",
912
+ "bread machine",
913
+ "yard",
914
+ "workshop",
915
+ "meadow",
916
+ "circle",
917
+ "railway bridge",
918
+ "river",
919
+ "restaurant",
920
+ "doorway",
921
+ "opera",
922
+ "track and field ground",
923
+ "sports meeting",
924
+ "subway",
925
+ "bullring",
926
+ "head",
927
+ "cudgel court",
928
+ "bathroom",
929
+ "bicycle shop",
930
+ "guide room",
931
+ "prairie track",
932
+ "snowfield",
933
+ "camel",
934
+ "seabed",
935
+ "aged",
936
+ "tropic",
937
+ "japan",
938
+ "srb",
939
+ "left eye",
940
+ "right eye",
941
+ "thumb",
942
+ "ones right hand",
943
+ "right",
944
+ "right hand",
945
+ "ger",
946
+ "leather gloves",
947
+ "second dives",
948
+ "left foot",
949
+ "her father",
950
+ "india",
951
+ "silverware",
952
+ "red jersey",
953
+ "blonde man",
954
+ "ring finger",
955
+ "catch fish",
956
+ "dive",
957
+ "pregnant",
958
+ "to throw far away",
959
+ "motion",
960
+ "sparring",
961
+ "to throw shot",
962
+ "be happy with drum",
963
+ "happy",
964
+ "get rid of blues",
965
+ "in washing up",
966
+ "some water on hands",
967
+ "wear jump shoes",
968
+ "in fight",
969
+ "someones good at skipping rope",
970
+ "for entertainment",
971
+ "win",
972
+ "easy to shave",
973
+ "make darts",
974
+ "labor saving",
975
+ "darts are easy to targets",
976
+ "express happiness",
977
+ "score",
978
+ "win in gambling",
979
+ "be covered by people",
980
+ "blindfolded eyes",
981
+ "to challenge themselves",
982
+ "prevent skidding",
983
+ "express joy",
984
+ "speed up javelin",
985
+ "more beautiful",
986
+ "give discus inertia",
987
+ "to express courtesy",
988
+ "guide children",
989
+ "blower is too heavy",
990
+ "greeting",
991
+ "cold prevention",
992
+ "end",
993
+ "be more beautiful",
994
+ "decorating house",
995
+ "trees are not neat",
996
+ "protect",
997
+ "protect eye",
998
+ "hungry",
999
+ "avoid messing",
1000
+ "thanks to audience",
1001
+ "easy eat",
1002
+ "arm force",
1003
+ "take part",
1004
+ "do public welfare",
1005
+ "reduce friction",
1006
+ "increase difficulty",
1007
+ "keep healthy",
1008
+ "burning fire",
1009
+ "convenient weeding",
1010
+ "express victory",
1011
+ "to dive",
1012
+ "scoop water",
1013
+ "record results",
1014
+ "speed up",
1015
+ "jump far",
1016
+ "unsteadiness of gravity",
1017
+ "curettage",
1018
+ "protect waist",
1019
+ "eatting",
1020
+ "make car light",
1021
+ "make manicure easier",
1022
+ "squeezing apple",
1023
+ "test light bulb",
1024
+ "car dirty",
1025
+ "mouth leaking",
1026
+ "clothing leveling",
1027
+ "be attacked",
1028
+ "interest",
1029
+ "easy blow",
1030
+ "fixed thread",
1031
+ "farther of the jump",
1032
+ "angry",
1033
+ "have personality",
1034
+ "avoid freezing hands",
1035
+ "protect teeth",
1036
+ "pick up speed",
1037
+ "safety guarantee",
1038
+ "you can jump farther",
1039
+ "others car broke down",
1040
+ "because long jump",
1041
+ "to blow sax",
1042
+ "put on hat",
1043
+ "reduce center of gravity",
1044
+ "hamper ball",
1045
+ "decorate",
1046
+ "acne treatment",
1047
+ "encouragement",
1048
+ "balance",
1049
+ "challenging",
1050
+ "bullet dodging",
1051
+ "interesting",
1052
+ "cheerful mood",
1053
+ "bring eggs together",
1054
+ "imbalance of body",
1055
+ "satisfy greed",
1056
+ "thank",
1057
+ "put child down",
1058
+ "drink green liquid",
1059
+ "let people pass through",
1060
+ "pursuit of stimulation",
1061
+ "better growth",
1062
+ "excitement",
1063
+ "to win",
1064
+ "ball inertia",
1065
+ "halloween decoration",
1066
+ "active atmosphere",
1067
+ "keep clean",
1068
+ "protect their heads",
1069
+ "good growth of plants",
1070
+ "almost power",
1071
+ "maintain level",
1072
+ "dinner",
1073
+ "encourage each other",
1074
+ "increase speed of shot",
1075
+ "irritation",
1076
+ "tire broken",
1077
+ "work",
1078
+ "art of display",
1079
+ "healthy",
1080
+ "prevent cramps",
1081
+ "fear tree be frozen",
1082
+ "convenient engraving",
1083
+ "clean",
1084
+ "ranking",
1085
+ "teaching video",
1086
+ "dress up",
1087
+ "test",
1088
+ "record video",
1089
+ "video recording",
1090
+ "improve skills",
1091
+ "recording time",
1092
+ "stick wallpaper",
1093
+ "accident prevention",
1094
+ "clean and beautiful",
1095
+ "convenient imitation",
1096
+ "convenient viewing",
1097
+ "greeting new year",
1098
+ "beautify environment",
1099
+ "well done",
1100
+ "art",
1101
+ "keep weight",
1102
+ "prevent being injury",
1103
+ "to be beautiful",
1104
+ "sport project",
1105
+ "competition",
1106
+ "make cake tasty",
1107
+ "bowl is dirty",
1108
+ "for sunscreen",
1109
+ "for safety",
1110
+ "dirty",
1111
+ "filter out water",
1112
+ "grass is too high",
1113
+ "anger",
1114
+ "to express comfort",
1115
+ "game lost",
1116
+ "strengthen proficiency",
1117
+ "record performance",
1118
+ "fear of hindering people",
1119
+ "curling",
1120
+ "challenge stimulus",
1121
+ "tired out",
1122
+ "more practice",
1123
+ "inertia",
1124
+ "end performance",
1125
+ "brain is damaged",
1126
+ "guessing victory",
1127
+ "escape",
1128
+ "because of smoking",
1129
+ "because of tattoo pain",
1130
+ "large density",
1131
+ "strong body",
1132
+ "because its done",
1133
+ "experience",
1134
+ "goal",
1135
+ "celebrate victory",
1136
+ "express excitement",
1137
+ "because you want to dive",
1138
+ "fixed ceramic tile",
1139
+ "easy understand",
1140
+ "avoid enzyme change",
1141
+ "for eatting",
1142
+ "convenient for walking",
1143
+ "prevent fall",
1144
+ "accelerate",
1145
+ "air rotation",
1146
+ "back somersault",
1147
+ "take oil",
1148
+ "table collapsed",
1149
+ "assembled rear tire",
1150
+ "tie up hair",
1151
+ "open your hands",
1152
+ "shave sideburns",
1153
+ "upside up",
1154
+ "turn head",
1155
+ "step back",
1156
+ "chilli powder",
1157
+ "stride left foot",
1158
+ "straighten back",
1159
+ "cut off",
1160
+ "anjali mudra",
1161
+ "look at effect",
1162
+ "active wrist",
1163
+ "toys have been raised",
1164
+ "waving",
1165
+ "teaching by step",
1166
+ "pulp",
1167
+ "grasp rope",
1168
+ "be pulled down",
1169
+ "nearly fall",
1170
+ "hand in hand",
1171
+ "stand up",
1172
+ "wave",
1173
+ "pine",
1174
+ "return to place",
1175
+ "fell to ground",
1176
+ "wear sunglasses",
1177
+ "skiing down hill",
1178
+ "wash dishes",
1179
+ "and put it back again",
1180
+ "fall on air cushion",
1181
+ "lose ball",
1182
+ "close scissors",
1183
+ "changing right hand",
1184
+ "rotary handlebar",
1185
+ "turn on light",
1186
+ "put sweet potato in bowls",
1187
+ "electrified",
1188
+ "vomit",
1189
+ "fall",
1190
+ "help him cut his hair",
1191
+ "fall into water",
1192
+ "bend",
1193
+ "stretch leg",
1194
+ "take off gloves",
1195
+ "put into water tank",
1196
+ "start dancing",
1197
+ "well cooked",
1198
+ "unable to stand firmly",
1199
+ "slide down slope",
1200
+ "put on leg",
1201
+ "black pants swimmers",
1202
+ "go on running",
1203
+ "kicking doll",
1204
+ "be cuddle",
1205
+ "go out of room",
1206
+ "puff out smoke",
1207
+ "start of the game",
1208
+ "control by foot",
1209
+ "celebrate",
1210
+ "pick up",
1211
+ "stir",
1212
+ "tissue",
1213
+ "bartender",
1214
+ "automative lighting",
1215
+ "audiences stand",
1216
+ "star shaped toy",
1217
+ "water sled",
1218
+ "red cloth",
1219
+ "curtain",
1220
+ "safety net",
1221
+ "pumpkin shed",
1222
+ "oxygen bottle",
1223
+ "big bottle",
1224
+ "spider man",
1225
+ "desert",
1226
+ "personal information",
1227
+ "painted wall",
1228
+ "red curtain",
1229
+ "both hands",
1230
+ "lot of boxes",
1231
+ "steps",
1232
+ "bookcase",
1233
+ "oven",
1234
+ "shed",
1235
+ "telegraph pole",
1236
+ "ornament",
1237
+ "white walls",
1238
+ "performers",
1239
+ "dresser",
1240
+ "water tap",
1241
+ "magic cube",
1242
+ "pot",
1243
+ "toaster oven",
1244
+ "tennis",
1245
+ "bullfighting field",
1246
+ "bet",
1247
+ "grinder machine",
1248
+ "scoreboard",
1249
+ "machine",
1250
+ "pull rope",
1251
+ "launch",
1252
+ "boiler",
1253
+ "potted plant",
1254
+ "photo",
1255
+ "minions",
1256
+ "ring",
1257
+ "blue sneakers",
1258
+ "big tree",
1259
+ "refrigerator",
1260
+ "line",
1261
+ "yellow paint",
1262
+ "iron chain",
1263
+ "patch",
1264
+ "glasses",
1265
+ "dolly",
1266
+ "hu la hoop",
1267
+ "safety hat",
1268
+ "plastic flowers",
1269
+ "lamp",
1270
+ "curling brush",
1271
+ "bushes",
1272
+ "long hair man",
1273
+ "child",
1274
+ "saw",
1275
+ "plate",
1276
+ "yellow basin",
1277
+ "men in black",
1278
+ "street lamp",
1279
+ "spare tire",
1280
+ "tattoo artist",
1281
+ "dry powder plate",
1282
+ "black hair woman",
1283
+ "barbell",
1284
+ "earring",
1285
+ "parallel bar",
1286
+ "large building",
1287
+ "ceramic tile",
1288
+ "person wearing a hat",
1289
+ "pond",
1290
+ "row of clothes",
1291
+ "iron",
1292
+ "masonry",
1293
+ "ladder",
1294
+ "cameraman",
1295
+ "canvas",
1296
+ "shrimp meat",
1297
+ "hand washing box",
1298
+ "awning",
1299
+ "stairs",
1300
+ "american flag",
1301
+ "socket",
1302
+ "cosmetics",
1303
+ "clothes",
1304
+ "romaine lettuce",
1305
+ "make body pliable",
1306
+ "rabbit",
1307
+ "red baffle",
1308
+ "horse running yard",
1309
+ "personnel",
1310
+ "banana tree",
1311
+ "british flag",
1312
+ "very excited",
1313
+ "red ball",
1314
+ "shore",
1315
+ "release pressure",
1316
+ "purify air",
1317
+ "exercise balance",
1318
+ "9",
1319
+ "exercise",
1320
+ "make report",
1321
+ "yelling",
1322
+ "eradicator",
1323
+ "mat cloth",
1324
+ "chopping block",
1325
+ "stone and water grass",
1326
+ "seawater",
1327
+ "crew cut",
1328
+ "cold",
1329
+ "red and black",
1330
+ "wash milk",
1331
+ "height",
1332
+ "clockwise",
1333
+ "anti clockwise",
1334
+ "forward",
1335
+ "up and down",
1336
+ "front",
1337
+ "5000 meters",
1338
+ "oxygen mask",
1339
+ "blue shirt",
1340
+ "very good",
1341
+ "supply lemon juice",
1342
+ "clean and tidy",
1343
+ "fixed hairstyle",
1344
+ "straw",
1345
+ "front rollover",
1346
+ "cry sadly",
1347
+ "very angry",
1348
+ "smile",
1349
+ "more fair",
1350
+ "its pretty",
1351
+ "eager",
1352
+ "fat",
1353
+ "a",
1354
+ "w",
1355
+ "t",
1356
+ "c",
1357
+ "cucumber",
1358
+ "woodiness",
1359
+ "mustache",
1360
+ "chopped wood",
1361
+ "ball frame",
1362
+ "target",
1363
+ "gambling table",
1364
+ "bag",
1365
+ "cross bar",
1366
+ "equip them with modern machines",
1367
+ "rock",
1368
+ "craft",
1369
+ "wood pile",
1370
+ "protective effect",
1371
+ "do horizontal bar gymnastics",
1372
+ "do homework",
1373
+ "main committee",
1374
+ "origami",
1375
+ "asymmetrical bars",
1376
+ "bob",
1377
+ "shoulder length hair",
1378
+ "single cauda",
1379
+ "short hair",
1380
+ "lime",
1381
+ "electromagnetic furnace",
1382
+ "clerk",
1383
+ "athletes",
1384
+ "reporter",
1385
+ "p",
1386
+ "pumpkin",
1387
+ "matador",
1388
+ "barber",
1389
+ "cooker",
1390
+ "fitness coach",
1391
+ "decorating umbrella",
1392
+ "basin",
1393
+ "group of children",
1394
+ "twinkle",
1395
+ "gymnastic performance",
1396
+ "beautiful arched eyebrows",
1397
+ "hosting news",
1398
+ "variety of wine",
1399
+ "sandwich",
1400
+ "skateboard of water skiing",
1401
+ "sing",
1402
+ "hold girl",
1403
+ "exercise weight lifting",
1404
+ "mowing lawn",
1405
+ "drift",
1406
+ "shoeshine",
1407
+ "cross country race",
1408
+ "women in red clothes",
1409
+ "rag",
1410
+ "ride cross country motorcycle",
1411
+ "bullfight",
1412
+ "broadcast",
1413
+ "black hair girl",
1414
+ "1154",
1415
+ "female host",
1416
+ "how to cut potatoes",
1417
+ "very pleased",
1418
+ "ballet dancer",
1419
+ "long rod",
1420
+ "four thousand and six hundred",
1421
+ "first",
1422
+ "korea",
1423
+ "walk",
1424
+ "recreational machines",
1425
+ "29",
1426
+ "1951",
1427
+ "330",
1428
+ "1144",
1429
+ "knock toy",
1430
+ "spinning",
1431
+ "mans head",
1432
+ "cut tree",
1433
+ "standing",
1434
+ "party",
1435
+ "brush",
1436
+ "practice water sled",
1437
+ "package book",
1438
+ "ski slope",
1439
+ "teach hair",
1440
+ "camera",
1441
+ "apple juice",
1442
+ "short hair baby",
1443
+ "half squat",
1444
+ "hold right leg",
1445
+ "squatting",
1446
+ "lead singer",
1447
+ "doctor",
1448
+ "trainer",
1449
+ "safe of car",
1450
+ "life buoy",
1451
+ "sandbag",
1452
+ "woven goods",
1453
+ "pepper",
1454
+ "companion",
1455
+ "partnership",
1456
+ "stranger",
1457
+ "competitor",
1458
+ "shopkeepers and shopkeepers",
1459
+ "brother and sister",
1460
+ "shopkeeper and customer",
1461
+ "masses and journalists",
1462
+ "boss and the customer",
1463
+ "referee and player",
1464
+ "row of cars",
1465
+ "hemp rope",
1466
+ "forest",
1467
+ "green brush",
1468
+ "casement window",
1469
+ "black hat man",
1470
+ "poisoning",
1471
+ "much much higher",
1472
+ "much safer",
1473
+ "removal of bubbles",
1474
+ "fixed hair",
1475
+ "natural lashes",
1476
+ "control direction",
1477
+ "10",
1478
+ "floating ice",
1479
+ "colorful",
1480
+ "red and white",
1481
+ "silver",
1482
+ "ballet",
1483
+ "take off shoes",
1484
+ "lift her up",
1485
+ "shaving cutter head",
1486
+ "scissors",
1487
+ "carding",
1488
+ "take off long rope",
1489
+ "round to other side",
1490
+ "stop",
1491
+ "hand ring",
1492
+ "photo display",
1493
+ "pull dog down",
1494
+ "two circles of rotation",
1495
+ "rolling skin",
1496
+ "feed bread",
1497
+ "drumstick",
1498
+ "pocket knife",
1499
+ "take off your shirt",
1500
+ "appearance of name",
1501
+ "press ceramic tile",
1502
+ "flexibility",
1503
+ "team consciousness",
1504
+ "throw away tissue",
1505
+ "scarf",
1506
+ "basketball",
1507
+ "hairpin",
1508
+ "branches and leaves fell",
1509
+ "release white smoke",
1510
+ "rugby",
1511
+ "prayers",
1512
+ "make phone",
1513
+ "gymnastics suit",
1514
+ "right handed rod",
1515
+ "lift truck",
1516
+ "unload barbell",
1517
+ "feathered pat",
1518
+ "physical energy",
1519
+ "plug igniter",
1520
+ "pen",
1521
+ "straw hat",
1522
+ "training skills",
1523
+ "harmonica",
1524
+ "plastic track",
1525
+ "flat shovel",
1526
+ "enjoy",
1527
+ "life jacket",
1528
+ "pine soil",
1529
+ "movement of hand",
1530
+ "wristwatch",
1531
+ "hands on ability",
1532
+ "needle",
1533
+ "obstacle",
1534
+ "vaulting horse",
1535
+ "podium",
1536
+ "yellow ironing cloth",
1537
+ "achievement is effective",
1538
+ "coordination",
1539
+ "man in green",
1540
+ "avoid opponents ball",
1541
+ "wear ballet shoes",
1542
+ "clap hands",
1543
+ "hold together",
1544
+ "continue down",
1545
+ "embrace",
1546
+ "shoeshine surface",
1547
+ "black container",
1548
+ "put cup down",
1549
+ "vertical thumb",
1550
+ "dog catches the plate",
1551
+ "then training",
1552
+ "running",
1553
+ "clapping",
1554
+ "listen to coach",
1555
+ "twist and a smile",
1556
+ "pinch with perturbed",
1557
+ "fixed with adhesive paper",
1558
+ "boy comes down",
1559
+ "snow fall",
1560
+ "do bike ride",
1561
+ "mans hand",
1562
+ "ball goal net",
1563
+ "need some help to correct",
1564
+ "cup was removed",
1565
+ "ball is bounced back",
1566
+ "camel came over",
1567
+ "pulverized people lead",
1568
+ "fly back to roof",
1569
+ "stop timing",
1570
+ "blow dry",
1571
+ "play with toys",
1572
+ "change tire",
1573
+ "fixed tire",
1574
+ "remove rubber ring",
1575
+ "drink milk tea",
1576
+ "go to black dog",
1577
+ "bow",
1578
+ "shoelace",
1579
+ "powdery base",
1580
+ "cup fell into water",
1581
+ "blue team serve",
1582
+ "get home",
1583
+ "blood flow",
1584
+ "car scrubbing",
1585
+ "take out chips",
1586
+ "smooth and smooth",
1587
+ "treatment of chili",
1588
+ "start training",
1589
+ "discus collision",
1590
+ "perm hand",
1591
+ "tutoring cadets",
1592
+ "eat ball",
1593
+ "dog planer",
1594
+ "wine precipitation",
1595
+ "start skiing",
1596
+ "wash ones face",
1597
+ "clenching fist",
1598
+ "assemble cake",
1599
+ "clip eyelash",
1600
+ "push people",
1601
+ "brace with feet",
1602
+ "take off faucet",
1603
+ "grab ball",
1604
+ "lay by small tree",
1605
+ "stop roller skating",
1606
+ "do edge decoration",
1607
+ "nail clipper",
1608
+ "look at blade",
1609
+ "evade automatically",
1610
+ "snowball",
1611
+ "slide out of site",
1612
+ "punching",
1613
+ "open ones eyes",
1614
+ "go down table",
1615
+ "blonde womens car wash",
1616
+ "splash",
1617
+ "look computer desktop",
1618
+ "board stands up",
1619
+ "inserting needle",
1620
+ "keep shaving",
1621
+ "leave",
1622
+ "take out gun",
1623
+ "turn around",
1624
+ "be pulled by ship",
1625
+ "tapping bucket",
1626
+ "cilping some lemon slices",
1627
+ "bow to lens",
1628
+ "eat up",
1629
+ "tie shoelaces",
1630
+ "throw ball out",
1631
+ "shoes getting clean",
1632
+ "throw it to wall",
1633
+ "regulating nozzle",
1634
+ "start tug of war",
1635
+ "frying eggs",
1636
+ "adhesive tape",
1637
+ "blow up",
1638
+ "go back to walk",
1639
+ "shot switching",
1640
+ "continue to grind knife",
1641
+ "hit each other",
1642
+ "hand down",
1643
+ "grey team celebrates",
1644
+ "draw rubber rafts",
1645
+ "push left dolls",
1646
+ "dog ran away",
1647
+ "wait for",
1648
+ "speak",
1649
+ "block gliding",
1650
+ "walk away",
1651
+ "put slice of lemon",
1652
+ "look javelin distance",
1653
+ "cooling",
1654
+ "eliminated",
1655
+ "smoothly",
1656
+ "water is flooded",
1657
+ "get up to pay",
1658
+ "change pair of shoes",
1659
+ "go ashore",
1660
+ "flushing razor",
1661
+ "emotional excitement",
1662
+ "throw snow on edge",
1663
+ "pour out washing powder",
1664
+ "start bungee jumping",
1665
+ "takeoff",
1666
+ "packing",
1667
+ "got third",
1668
+ "darkened",
1669
+ "drink water",
1670
+ "caterpillar wallpaper",
1671
+ "lose match",
1672
+ "cow fell down",
1673
+ "inside car",
1674
+ "lid pressure",
1675
+ "throw barbell",
1676
+ "screwing nut",
1677
+ "fabric line",
1678
+ "hold child",
1679
+ "cook noodles",
1680
+ "continue to split wood",
1681
+ "clean horses hind leg",
1682
+ "very happy",
1683
+ "man jump down",
1684
+ "cutting wallpaper",
1685
+ "wear good left eye",
1686
+ "start jumping",
1687
+ "punches",
1688
+ "speech",
1689
+ "exhibition",
1690
+ "garden cart",
1691
+ "dismantling light bulb",
1692
+ "go to diver",
1693
+ "step by step",
1694
+ "sit down on your knees",
1695
+ "preparation materials",
1696
+ "get off car",
1697
+ "let curling round legs",
1698
+ "wear gloves",
1699
+ "fermation",
1700
+ "shaving cutter",
1701
+ "put down saw",
1702
+ "pony fell down",
1703
+ "raise ones hands",
1704
+ "turn two laps",
1705
+ "open ones mouth",
1706
+ "explanation usage",
1707
+ "pull hand brake",
1708
+ "floating on water",
1709
+ "open sound",
1710
+ "cut cake",
1711
+ "air pressure",
1712
+ "go to woman",
1713
+ "introduce",
1714
+ "prepare weapon",
1715
+ "sweep floor",
1716
+ "prepare bar",
1717
+ "dipping pigment",
1718
+ "count",
1719
+ "pull hands back",
1720
+ "tidy wire",
1721
+ "hold harmonica",
1722
+ "floating in water",
1723
+ "bet on",
1724
+ "introduction of pang ball",
1725
+ "display ornaments",
1726
+ "explanation skills",
1727
+ "stand straight",
1728
+ "erect canvas",
1729
+ "dipping shoe oil",
1730
+ "fooling around",
1731
+ "measuring distance",
1732
+ "display tools",
1733
+ "nail nail",
1734
+ "rotate",
1735
+ "pre match preparation",
1736
+ "wearing frog shoe",
1737
+ "pose",
1738
+ "smear shaving blister",
1739
+ "send message",
1740
+ "select curler",
1741
+ "stake",
1742
+ "prepare tools",
1743
+ "cement",
1744
+ "become biscuit",
1745
+ "display product",
1746
+ "display front door",
1747
+ "showcase",
1748
+ "water preparation",
1749
+ "open mouth and sing",
1750
+ "fall down",
1751
+ "go fishing",
1752
+ "gargle with fat man",
1753
+ "spray liquid",
1754
+ "on tree",
1755
+ "cobra attack",
1756
+ "feet shaking",
1757
+ "bubble",
1758
+ "use long stick",
1759
+ "with boat overhead",
1760
+ "camel stopped",
1761
+ "replacement",
1762
+ "prepare pigments",
1763
+ "raise your right hand",
1764
+ "blow out cigarette smoke",
1765
+ "hand clenched fist",
1766
+ "support body with two hands",
1767
+ "frictional runway",
1768
+ "scraping",
1769
+ "clap with your teammates",
1770
+ "talk to others",
1771
+ "dry it",
1772
+ "hand over nose",
1773
+ "turn to landscape",
1774
+ "horst hula hoop",
1775
+ "ball hit the knee",
1776
+ "steady and steady",
1777
+ "swing",
1778
+ "release pole",
1779
+ "press button",
1780
+ "wear glasses",
1781
+ "start pruning trees",
1782
+ "roll forward",
1783
+ "spray cleaning solution",
1784
+ "middle of tyres",
1785
+ "selfie",
1786
+ "tease hair",
1787
+ "talk to screen",
1788
+ "standing bow",
1789
+ "climb up",
1790
+ "stalemate",
1791
+ "take down darts",
1792
+ "high five",
1793
+ "tidy",
1794
+ "put down magic cube",
1795
+ "scratching nose",
1796
+ "dispel crowd",
1797
+ "toothpaste",
1798
+ "waving hand",
1799
+ "bright smile",
1800
+ "dogs before him",
1801
+ "fixed toy",
1802
+ "low down barycenter",
1803
+ "warm up exercise",
1804
+ "run forward",
1805
+ "group photo",
1806
+ "throw drumbeat",
1807
+ "land on ground",
1808
+ "lens conversion",
1809
+ "satisfactory smile",
1810
+ "erect violin",
1811
+ "handshake",
1812
+ "orange clothes male clenched fist",
1813
+ "open",
1814
+ "add two kites",
1815
+ "weigh it down",
1816
+ "kite flew high",
1817
+ "pull off tires",
1818
+ "black clothes people waved",
1819
+ "test ball",
1820
+ "rush to red cloth",
1821
+ "hit himself",
1822
+ "get blow out",
1823
+ "kick off cushion",
1824
+ "fish bite",
1825
+ "bite frisbee",
1826
+ "lift dog",
1827
+ "applause",
1828
+ "children shook hands",
1829
+ "handstand",
1830
+ "shake hand",
1831
+ "picture change",
1832
+ "try again",
1833
+ "clap hand to celebrate",
1834
+ "role pause",
1835
+ "do somersault",
1836
+ "pirouette",
1837
+ "raise hand",
1838
+ "carry out performance",
1839
+ "pendulum poss",
1840
+ "eat noodles",
1841
+ "dry powder",
1842
+ "drive to right",
1843
+ "pick up phone",
1844
+ "weeds are gone",
1845
+ "right man stands up",
1846
+ "stop recording",
1847
+ "amuse dog",
1848
+ "adults chipping potatoes",
1849
+ "loose soil",
1850
+ "trot",
1851
+ "ship opened",
1852
+ "put in plate",
1853
+ "jump into pool",
1854
+ "string string",
1855
+ "draw on pumpkin",
1856
+ "white male nodding",
1857
+ "height of grass",
1858
+ "pass ball",
1859
+ "stop rowing",
1860
+ "single hand support",
1861
+ "hold down",
1862
+ "go to machine",
1863
+ "set up shelf",
1864
+ "start weeding",
1865
+ "wear hoop",
1866
+ "tune stringed instrument",
1867
+ "tape",
1868
+ "be blocked",
1869
+ "assembly of steel",
1870
+ "relax with mobile phone",
1871
+ "horse is moving",
1872
+ "swivel",
1873
+ "big mother",
1874
+ "turn and half squat down",
1875
+ "piled to corner",
1876
+ "audience cheered",
1877
+ "others come on field",
1878
+ "lens deflection",
1879
+ "put ball",
1880
+ "in dialogue",
1881
+ "aim",
1882
+ "go through last",
1883
+ "fist",
1884
+ "putting camera",
1885
+ "wiping car with foam",
1886
+ "dance together",
1887
+ "grasp parallel bars",
1888
+ "squat down",
1889
+ "shut down machine",
1890
+ "climb stairs",
1891
+ "washing spoon",
1892
+ "dive preparation",
1893
+ "touch lens",
1894
+ "shake table",
1895
+ "be smashed in drum",
1896
+ "anchor jump rope",
1897
+ "hit masses",
1898
+ "see undersea biology",
1899
+ "fall into river",
1900
+ "cut sandwiches",
1901
+ "horse nodding",
1902
+ "put down flute",
1903
+ "wash face",
1904
+ "put on bread",
1905
+ "display shoes",
1906
+ "touch liquid",
1907
+ "immerse in it",
1908
+ "sit on sofa",
1909
+ "leave court",
1910
+ "to everyone ju gong",
1911
+ "display finger",
1912
+ "throw out again",
1913
+ "turn picture back",
1914
+ "middle finger",
1915
+ "pressing on player",
1916
+ "pick up racket",
1917
+ "get up slowly",
1918
+ "pick up cup",
1919
+ "lose to opponent",
1920
+ "backward dodge",
1921
+ "concealed cards",
1922
+ "get first",
1923
+ "roll in sky",
1924
+ "start car wash",
1925
+ "grab railing",
1926
+ "twist butt",
1927
+ "demo bicycles",
1928
+ "throw long pole",
1929
+ "lift barbell",
1930
+ "cross leg",
1931
+ "wear socks",
1932
+ "wipe hands",
1933
+ "site",
1934
+ "hold black clothes man",
1935
+ "hit ball",
1936
+ "kick ball out",
1937
+ "skater surfers",
1938
+ "continue to weave sweater",
1939
+ "put cards in box",
1940
+ "fiddle with hair",
1941
+ "go away",
1942
+ "refresh",
1943
+ "psychological adjustment",
1944
+ "join noodles",
1945
+ "fall off stage",
1946
+ "skidding",
1947
+ "stir by spatula",
1948
+ "someone is parachuting",
1949
+ "fall off air cushion",
1950
+ "finish up job",
1951
+ "chasing people",
1952
+ "spaghetti",
1953
+ "bump into pot",
1954
+ "continue knocking",
1955
+ "ask for note",
1956
+ "bow hairstyle",
1957
+ "two people dance",
1958
+ "cross hand",
1959
+ "ball landed",
1960
+ "dog chases plate",
1961
+ "dog came back with something",
1962
+ "be gone",
1963
+ "attack",
1964
+ "swing rod",
1965
+ "raise your left hand",
1966
+ "plaything",
1967
+ "touch ones head",
1968
+ "stand on bench",
1969
+ "rouge lips",
1970
+ "original takeoff",
1971
+ "rinse",
1972
+ "straighten rod",
1973
+ "ironing sleeves",
1974
+ "squating",
1975
+ "start riding",
1976
+ "open mouth ventilation",
1977
+ "support yourself",
1978
+ "dark bigger",
1979
+ "light",
1980
+ "mice",
1981
+ "smear oil",
1982
+ "lacquer",
1983
+ "coated hair cream",
1984
+ "throw away",
1985
+ "dog jumped up",
1986
+ "counterclockwise running",
1987
+ "back to lens",
1988
+ "push leaves off",
1989
+ "shallow sea",
1990
+ "long hair",
1991
+ "straight",
1992
+ "short sleeve",
1993
+ "double fold eyelids",
1994
+ "trousers",
1995
+ "ripe",
1996
+ "older",
1997
+ "long sleeves",
1998
+ "take off skirt",
1999
+ "white men fall in water",
2000
+ "marvellous",
2001
+ "leg clip",
2002
+ "tortoise",
2003
+ "vigny",
2004
+ "play drifting air cushion",
2005
+ "parade in desert",
2006
+ "wood brush",
2007
+ "pulley slippery",
2008
+ "musical instrument",
2009
+ "noodle",
2010
+ "skating canoe",
2011
+ "diving in sea",
2012
+ "street performance",
2013
+ "street dance",
2014
+ "tackle rubber rafts",
2015
+ "dunk",
2016
+ "on toilet",
2017
+ "prepare for skiing",
2018
+ "climb escalator",
2019
+ "polo",
2020
+ "uneven bars",
2021
+ "play lacrosse",
2022
+ "pruning shrubs",
2023
+ "clean up garbage",
2024
+ "sports competition",
2025
+ "sliding skateboard",
2026
+ "good afforested environment",
2027
+ "displacement pressure",
2028
+ "braid",
2029
+ "play toy",
2030
+ "chips",
2031
+ "discus",
2032
+ "air cushion",
2033
+ "operation guidance",
2034
+ "riding unicycle",
2035
+ "rt",
2036
+ "bids gun",
2037
+ "gatekeepers",
2038
+ "fork",
2039
+ "paring knife",
2040
+ "touch back of brain",
2041
+ "shave dog",
2042
+ "practice fencing",
2043
+ "fencing training",
2044
+ "press media",
2045
+ "fencing",
2046
+ "play bungee jumping",
2047
+ "need to connect wire",
2048
+ "hair of bun",
2049
+ "sports boat",
2050
+ "shoe polisher",
2051
+ "white person",
2052
+ "decorate some boxes",
2053
+ "pet beauty",
2054
+ "practice ballet",
2055
+ "apple",
2056
+ "many people are looking",
2057
+ "shower",
2058
+ "body coordination",
2059
+ "advertisement",
2060
+ "blue and green",
2061
+ "blue and red",
2062
+ "small stool",
2063
+ "motorcycle",
2064
+ "big",
2065
+ "shirt",
2066
+ "tourism advertisement",
2067
+ "kelp",
2068
+ "sea anemone",
2069
+ "autumn",
2070
+ "harmful",
2071
+ "clever",
2072
+ "very nice",
2073
+ "jump down",
2074
+ "narrow",
2075
+ "shadow",
2076
+ "much more difficult",
2077
+ "ten meters",
2078
+ "6 centimeters",
2079
+ "more proficient",
2080
+ "pretty good",
2081
+ "adept",
2082
+ "13",
2083
+ "two",
2084
+ "sixth",
2085
+ "9.75",
2086
+ "350",
2087
+ "walking",
2088
+ "standard",
2089
+ "superb",
2090
+ "torrential",
2091
+ "skilled",
2092
+ "wear exposure",
2093
+ "sing in earnest",
2094
+ "strong",
2095
+ "cool",
2096
+ "high and low",
2097
+ "lovely",
2098
+ "concise and clear",
2099
+ "they hurt",
2100
+ "skating rink",
2101
+ "2011",
2102
+ "usa",
2103
+ "tie",
2104
+ "small scale"
2105
+ ]
ChatUniVi/eval/questions/video_qa/activitynet_qa.json ADDED
The diff for this file is too large to render. See raw diff
 
ChatUniVi/eval/questions/video_qa/consistency_qa.json ADDED
The diff for this file is too large to render. See raw diff
 
ChatUniVi/eval/questions/video_qa/generic_qa.json ADDED
The diff for this file is too large to render. See raw diff
 
ChatUniVi/eval/questions/video_qa/msrvtt_a_list.json ADDED
@@ -0,0 +1,4175 @@
1
+ [
2
+ "regard",
3
+ "show",
4
+ "explain",
5
+ "search",
6
+ "provide",
7
+ "computer",
8
+ "guy",
9
+ "something",
10
+ "someone",
11
+ "person",
12
+ "teach",
13
+ "exclaim",
14
+ "stand",
15
+ "learn",
16
+ "pose",
17
+ "move",
18
+ "pain",
19
+ "video",
20
+ "color",
21
+ "paint",
22
+ "animation",
23
+ "cartoon",
24
+ "talk",
25
+ "sit",
26
+ "wear",
27
+ "discus",
28
+ "man",
29
+ "room",
30
+ "health",
31
+ "suit",
32
+ "chair",
33
+ "fun",
34
+ "studio",
35
+ "write",
36
+ "teacher",
37
+ "chalkboard",
38
+ "problem",
39
+ "math",
40
+ "whiteboard",
41
+ "lesson",
42
+ "board",
43
+ "stage",
44
+ "girl",
45
+ "gameshow",
46
+ "judge",
47
+ "perform",
48
+ "female",
49
+ "laugh",
50
+ "entertain",
51
+ "dance",
52
+ "shout",
53
+ "friend",
54
+ "woman",
55
+ "demonstration",
56
+ "boy",
57
+ "song",
58
+ "male",
59
+ "sister",
60
+ "marble",
61
+ "play",
62
+ "put",
63
+ "mix",
64
+ "decorate",
65
+ "help",
66
+ "easter",
67
+ "egg",
68
+ "table",
69
+ "water",
70
+ "holder",
71
+ "polish",
72
+ "game",
73
+ "earth",
74
+ "minecraft",
75
+ "gamer",
76
+ "nether",
77
+ "break",
78
+ "mine",
79
+ "hammer",
80
+ "commentate",
81
+ "peson",
82
+ "rap",
83
+ "picture",
84
+ "voiceover",
85
+ "flash",
86
+ "rapper",
87
+ "two",
88
+ "change",
89
+ "throw",
90
+ "system",
91
+ "wall",
92
+ "office",
93
+ "camera",
94
+ "create",
95
+ "prepare",
96
+ "construct",
97
+ "craft",
98
+ "look",
99
+ "place",
100
+ "wrap",
101
+ "art",
102
+ "project",
103
+ "clay",
104
+ "leaf",
105
+ "dough",
106
+ "cake",
107
+ "player",
108
+ "baseball",
109
+ "ball",
110
+ "announcer",
111
+ "hit",
112
+ "call",
113
+ "team",
114
+ "outfielder",
115
+ "fashion",
116
+ "walk",
117
+ "fasion",
118
+ "ailse",
119
+ "runway",
120
+ "catwalk",
121
+ "clothing",
122
+ "display",
123
+ "pat",
124
+ "use",
125
+ "make",
126
+ "foundation",
127
+ "lady",
128
+ "face",
129
+ "clean",
130
+ "bath",
131
+ "tub",
132
+ "toy",
133
+ "brush",
134
+ "cat",
135
+ "orange",
136
+ "scrubber",
137
+ "child",
138
+ "relax",
139
+ "swim",
140
+ "lake",
141
+ "field",
142
+ "dres",
143
+ "beach",
144
+ "gras",
145
+ "car",
146
+ "vehicle",
147
+ "introduce",
148
+ "drive",
149
+ "describe",
150
+ "review",
151
+ "movie",
152
+ "load",
153
+ "speak",
154
+ "watch",
155
+ "begin",
156
+ "tap",
157
+ "space",
158
+ "bench",
159
+ "glimpse",
160
+ "screen",
161
+ "globe",
162
+ "planet",
163
+ "world",
164
+ "explore",
165
+ "backstreet",
166
+ "restaurant",
167
+ "divider",
168
+ "asium",
169
+ "china",
170
+ "city",
171
+ "band",
172
+ "road",
173
+ "singer",
174
+ "pick",
175
+ "phone",
176
+ "telephone",
177
+ "taunt",
178
+ "squidward",
179
+ "spongebob",
180
+ "character",
181
+ "scene",
182
+ "half",
183
+ "group",
184
+ "instructor",
185
+ "floor",
186
+ "instruction",
187
+ "ballroom",
188
+ "clas",
189
+ "jog",
190
+ "tell",
191
+ "exercise",
192
+ "three",
193
+ "circle",
194
+ "mother",
195
+ "mom",
196
+ "family",
197
+ "jogging",
198
+ "skill",
199
+ "hardware",
200
+ "piece",
201
+ "detail",
202
+ "mechanic",
203
+ "item",
204
+ "demonstrate",
205
+ "work",
206
+ "repair",
207
+ "side",
208
+ "way",
209
+ "front",
210
+ "body",
211
+ "coach",
212
+ "athlete",
213
+ "angle",
214
+ "activity",
215
+ "present",
216
+ "include",
217
+ "tos",
218
+ "baby",
219
+ "hold",
220
+ "transition",
221
+ "youtuber",
222
+ "segment",
223
+ "individual",
224
+ "news",
225
+ "doll",
226
+ "subject",
227
+ "selfie",
228
+ "leg",
229
+ "shake",
230
+ "spread",
231
+ "blue",
232
+ "lighting",
233
+ "beam",
234
+ "warcraft",
235
+ "garden",
236
+ "house",
237
+ "island",
238
+ "roof",
239
+ "record",
240
+ "building",
241
+ "football",
242
+ "glas",
243
+ "spin",
244
+ "basket",
245
+ "music",
246
+ "program",
247
+ "basketball",
248
+ "sport",
249
+ "food",
250
+ "kitchenthere",
251
+ "liquid",
252
+ "fry",
253
+ "chef",
254
+ "crab",
255
+ "kitchen",
256
+ "boil",
257
+ "melt",
258
+ "stir",
259
+ "model",
260
+ "diagram",
261
+ "draw",
262
+ "party",
263
+ "bedroom",
264
+ "eye",
265
+ "bos",
266
+ "bigg",
267
+ "advertisement",
268
+ "hug",
269
+ "blink",
270
+ "google",
271
+ "narrator",
272
+ "classy",
273
+ "driving",
274
+ "lean",
275
+ "newsreporter",
276
+ "cameraman",
277
+ "photographer",
278
+ "shoot",
279
+ "photography",
280
+ "burn",
281
+ "alternate",
282
+ "fire",
283
+ "pink",
284
+ "hang",
285
+ "blow",
286
+ "visit",
287
+ "handbag",
288
+ "horn",
289
+ "shopping",
290
+ "market",
291
+ "bed",
292
+ "telecast",
293
+ "conversation",
294
+ "wife",
295
+ "lay",
296
+ "lie",
297
+ "scold",
298
+ "shirtles",
299
+ "lounge",
300
+ "saurkraut",
301
+ "sauerkraut",
302
+ "dish",
303
+ "plate",
304
+ "eat",
305
+ "serve",
306
+ "stroller",
307
+ "fold",
308
+ "push",
309
+ "sidewalk",
310
+ "demo",
311
+ "trolley",
312
+ "boner",
313
+ "narrate",
314
+ "argue",
315
+ "startle",
316
+ "princes",
317
+ "voice",
318
+ "leah",
319
+ "confirm",
320
+ "animalist",
321
+ "mirror",
322
+ "test",
323
+ "experience",
324
+ "drag",
325
+ "embrace",
326
+ "lion",
327
+ "react",
328
+ "enjoy",
329
+ "fly",
330
+ "nyancat",
331
+ "rainbow",
332
+ "machine",
333
+ "robot",
334
+ "issue",
335
+ "equipment",
336
+ "astronaut",
337
+ "try",
338
+ "attempt",
339
+ "intercept",
340
+ "pilot",
341
+ "observe",
342
+ "negotiate",
343
+ "operate",
344
+ "ask",
345
+ "landing",
346
+ "eachother",
347
+ "matthew",
348
+ "procedure",
349
+ "spaceship",
350
+ "cook",
351
+ "vest",
352
+ "stove",
353
+ "distres",
354
+ "ground",
355
+ "emergency",
356
+ "shoplift",
357
+ "mob",
358
+ "crowd",
359
+ "panic",
360
+ "wolf",
361
+ "scream",
362
+ "bear",
363
+ "interact",
364
+ "joke",
365
+ "animal",
366
+ "monkey",
367
+ "shirt",
368
+ "sign",
369
+ "bird",
370
+ "hand",
371
+ "carry",
372
+ "rest",
373
+ "meow",
374
+ "lift",
375
+ "parrotlove",
376
+ "parrot",
377
+ "top",
378
+ "film",
379
+ "cage",
380
+ "rescue",
381
+ "introduction",
382
+ "extinguisher",
383
+ "flame",
384
+ "float",
385
+ "aquarium",
386
+ "tank",
387
+ "gold",
388
+ "plant",
389
+ "fish",
390
+ "merchandise",
391
+ "purse",
392
+ "wallet",
393
+ "merchandising",
394
+ "stephen",
395
+ "london",
396
+ "reminisce",
397
+ "advertise",
398
+ "product",
399
+ "sale",
400
+ "desk",
401
+ "country",
402
+ "violin",
403
+ "childhood",
404
+ "soccer",
405
+ "involve",
406
+ "drift",
407
+ "preform",
408
+ "applause",
409
+ "language",
410
+ "star",
411
+ "comment",
412
+ "paper",
413
+ "pencil",
414
+ "drawl",
415
+ "lap",
416
+ "mouse",
417
+ "rugby",
418
+ "rant",
419
+ "contract",
420
+ "quarterback",
421
+ "webcamera",
422
+ "professor",
423
+ "college",
424
+ "cloth",
425
+ "microphone",
426
+ "balance",
427
+ "bounce",
428
+ "gym",
429
+ "dvd",
430
+ "leaflet",
431
+ "kid",
432
+ "pamphlet",
433
+ "event",
434
+ "sync",
435
+ "night",
436
+ "carriage",
437
+ "adjust",
438
+ "carrier",
439
+ "act",
440
+ "audience",
441
+ "concert",
442
+ "sin",
443
+ "performance",
444
+ "rock",
445
+ "webcam",
446
+ "trailer",
447
+ "browse",
448
+ "download",
449
+ "youtube",
450
+ "internet",
451
+ "page",
452
+ "mp3",
453
+ "war",
454
+ "guitar",
455
+ "tie",
456
+ "cheer",
457
+ "coil",
458
+ "loop",
459
+ "fishing",
460
+ "wire",
461
+ "turkey",
462
+ "drink",
463
+ "grill",
464
+ "meat",
465
+ "cooking",
466
+ "sizzle",
467
+ "tawa",
468
+ "pot",
469
+ "rice",
470
+ "sequence",
471
+ "stone",
472
+ "answer",
473
+ "incident",
474
+ "cam",
475
+ "store",
476
+ "supermarket",
477
+ "set",
478
+ "sail",
479
+ "cruise",
480
+ "vacation",
481
+ "sea",
482
+ "boat",
483
+ "ship",
484
+ "sand",
485
+ "ocean",
486
+ "landscape",
487
+ "mountain",
488
+ "dog",
489
+ "deputy",
490
+ "community",
491
+ "polouse",
492
+ "contestant",
493
+ "idol",
494
+ "panel",
495
+ "performer",
496
+ "reality",
497
+ "advice",
498
+ "crossbow",
499
+ "bow",
500
+ "arrow",
501
+ "practice",
502
+ "patrol",
503
+ "report",
504
+ "rail",
505
+ "mention",
506
+ "read",
507
+ "swat",
508
+ "newscaster",
509
+ "security",
510
+ "protection",
511
+ "crimea",
512
+ "guard",
513
+ "area",
514
+ "polymer",
515
+ "knife",
516
+ "bit",
517
+ "hobby",
518
+ "point",
519
+ "touch",
520
+ "design",
521
+ "artist",
522
+ "shape",
523
+ "poke",
524
+ "matt",
525
+ "briefly",
526
+ "damon",
527
+ "keyboard",
528
+ "animator",
529
+ "history",
530
+ "type",
531
+ "clipping",
532
+ "celebrity",
533
+ "birthday",
534
+ "smile",
535
+ "thor",
536
+ "shooting",
537
+ "rain",
538
+ "outdoor",
539
+ "pull",
540
+ "lego",
541
+ "pajama",
542
+ "clip",
543
+ "amna",
544
+ "interview",
545
+ "conduct",
546
+ "feature",
547
+ "choice",
548
+ "award",
549
+ "function",
550
+ "announce",
551
+ "seat",
552
+ "roadway",
553
+ "travel",
554
+ "window",
555
+ "sleep",
556
+ "whistle",
557
+ "doghouse",
558
+ "spawn",
559
+ "setting",
560
+ "random",
561
+ "fence",
562
+ "skating",
563
+ "teenager",
564
+ "skateboard",
565
+ "skate",
566
+ "text",
567
+ "spiral",
568
+ "shoe",
569
+ "turn",
570
+ "rotate",
571
+ "dna",
572
+ "information",
573
+ "strand",
574
+ "junk",
575
+ "helix",
576
+ "explanation",
577
+ "suv",
578
+ "fur",
579
+ "horse",
580
+ "shanium",
581
+ "head",
582
+ "rush",
583
+ "town",
584
+ "disney",
585
+ "four",
586
+ "barbie",
587
+ "dancing",
588
+ "diagnose",
589
+ "share",
590
+ "industry",
591
+ "air",
592
+ "racer",
593
+ "monitor",
594
+ "race",
595
+ "hood",
596
+ "patrick",
597
+ "sponge",
598
+ "snail",
599
+ "driveway",
600
+ "street",
601
+ "hoop",
602
+ "practise",
603
+ "sparrow",
604
+ "box",
605
+ "contraption",
606
+ "chirp",
607
+ "scratch",
608
+ "climb",
609
+ "peck",
610
+ "plot",
611
+ "mystery",
612
+ "detective",
613
+ "investigate",
614
+ "doorway",
615
+ "line",
616
+ "giggle",
617
+ "culture",
618
+ "cover",
619
+ "send",
620
+ "knock",
621
+ "broadcast",
622
+ "neck",
623
+ "hide",
624
+ "father",
625
+ "cliff",
626
+ "stewie",
627
+ "griffen",
628
+ "feel",
629
+ "stay",
630
+ "kis",
631
+ "breast",
632
+ "chest",
633
+ "porch",
634
+ "door",
635
+ "walkway",
636
+ "factory",
637
+ "development",
638
+ "span",
639
+ "logo",
640
+ "vision",
641
+ "ryanvision",
642
+ "motion",
643
+ "newsanchor",
644
+ "celebrate",
645
+ "commentator",
646
+ "steer",
647
+ "speed",
648
+ "driver",
649
+ "traffic",
650
+ "commentary",
651
+ "thing",
652
+ "extension",
653
+ "cellphone",
654
+ "mobile",
655
+ "chat",
656
+ "message",
657
+ "creature",
658
+ "whale",
659
+ "giant",
660
+ "surface",
661
+ "jump",
662
+ "approach",
663
+ "emerge",
664
+ "breach",
665
+ "dive",
666
+ "track",
667
+ "shot",
668
+ "starting",
669
+ "start",
670
+ "runner",
671
+ "compare",
672
+ "busines",
673
+ "campu",
674
+ "fist",
675
+ "school",
676
+ "book",
677
+ "reading",
678
+ "classroom",
679
+ "amazon",
680
+ "atmosphere",
681
+ "cave",
682
+ "farm",
683
+ "ice",
684
+ "frost",
685
+ "wed",
686
+ "cream",
687
+ "murray",
688
+ "bill",
689
+ "substance",
690
+ "park",
691
+ "salesman",
692
+ "aman",
693
+ "sell",
694
+ "lenin",
695
+ "education",
696
+ "bowl",
697
+ "recipe",
698
+ "pmy",
699
+ "time",
700
+ "nobody",
701
+ "pretty",
702
+ "hair",
703
+ "blonde",
704
+ "stuffed",
705
+ "femal",
706
+ "figure",
707
+ "goat",
708
+ "stream",
709
+ "mod",
710
+ "spa",
711
+ "rub",
712
+ "massage",
713
+ "wash",
714
+ "milk",
715
+ "economy",
716
+ "leader",
717
+ "money",
718
+ "raise",
719
+ "crawl",
720
+ "slide",
721
+ "fall",
722
+ "stair",
723
+ "cowboy",
724
+ "bacon",
725
+ "barbecue",
726
+ "flower",
727
+ "cheif",
728
+ "meal",
729
+ "kithen",
730
+ "treat",
731
+ "preview",
732
+ "dubstep",
733
+ "wave",
734
+ "view",
735
+ "background",
736
+ "batting",
737
+ "batsman",
738
+ "tsunami",
739
+ "bat",
740
+ "photo",
741
+ "one",
742
+ "pirate",
743
+ "sword",
744
+ "transportation",
745
+ "river",
746
+ "waterfall",
747
+ "gush",
748
+ "wrestle",
749
+ "wrestling",
750
+ "match",
751
+ "competition",
752
+ "fight",
753
+ "microwave",
754
+ "oven",
755
+ "inhome",
756
+ "heat",
757
+ "wooden",
758
+ "mud",
759
+ "exit",
760
+ "compete",
761
+ "moody",
762
+ "tom",
763
+ "pokeman",
764
+ "adventure",
765
+ "togepus",
766
+ "pokemon",
767
+ "hat",
768
+ "television",
769
+ "ash",
770
+ "debate",
771
+ "podium",
772
+ "cnn",
773
+ "speaker",
774
+ "speechles",
775
+ "cruz",
776
+ "recap",
777
+ "somethimg",
778
+ "letterman",
779
+ "leterman",
780
+ "actor",
781
+ "host",
782
+ "couch",
783
+ "doze",
784
+ "airplane",
785
+ "plane",
786
+ "gray",
787
+ "beauty",
788
+ "love",
789
+ "volleyball",
790
+ "interest",
791
+ "cry",
792
+ "haras",
793
+ "crow",
794
+ "attack",
795
+ "knight",
796
+ "sky",
797
+ "catch",
798
+ "maiden",
799
+ "soldier",
800
+ "warrior",
801
+ "oil",
802
+ "origami",
803
+ "cheese",
804
+ "silver",
805
+ "home",
806
+ "software",
807
+ "application",
808
+ "discussion",
809
+ "ring",
810
+ "sink",
811
+ "nurse",
812
+ "wipe",
813
+ "dip",
814
+ "deal",
815
+ "target",
816
+ "britney",
817
+ "conference",
818
+ "metal",
819
+ "medal",
820
+ "chi",
821
+ "die",
822
+ "daughter",
823
+ "rinse",
824
+ "build",
825
+ "ride",
826
+ "gondola",
827
+ "snow",
828
+ "hill",
829
+ "rope",
830
+ "lft",
831
+ "monorail",
832
+ "chairlift",
833
+ "peeple",
834
+ "member",
835
+ "harry",
836
+ "flight",
837
+ "moon",
838
+ "mission",
839
+ "parody",
840
+ "gangnam",
841
+ "gun",
842
+ "cortona",
843
+ "outline",
844
+ "pack",
845
+ "bod",
846
+ "charachter",
847
+ "heckle",
848
+ "tenni",
849
+ "pong",
850
+ "end",
851
+ "cardio",
852
+ "attend",
853
+ "rear",
854
+ "win",
855
+ "jockey",
856
+ "macdonald",
857
+ "wish",
858
+ "greet",
859
+ "meet",
860
+ "shuttle",
861
+ "iss",
862
+ "science",
863
+ "dusk",
864
+ "couple",
865
+ "doctor",
866
+ "tardi",
867
+ "cape",
868
+ "salad",
869
+ "chop",
870
+ "cut",
871
+ "gear",
872
+ "machinery",
873
+ "court",
874
+ "pace",
875
+ "cort",
876
+ "cool",
877
+ "bald",
878
+ "trick",
879
+ "spec",
880
+ "communist",
881
+ "museum",
882
+ "hangar",
883
+ "laptop",
884
+ "lab",
885
+ "cup",
886
+ "spuid",
887
+ "octopu",
888
+ "bottle",
889
+ "jellyfish",
890
+ "squid",
891
+ "dirge",
892
+ "lessen",
893
+ "anthem",
894
+ "shift",
895
+ "superhero",
896
+ "foam",
897
+ "fit",
898
+ "instal",
899
+ "fix",
900
+ "mat",
901
+ "cusion",
902
+ "construction",
903
+ "cushion",
904
+ "material",
905
+ "celeberatey",
906
+ "trophy",
907
+ "appreciate",
908
+ "cute",
909
+ "pas",
910
+ "lead",
911
+ "gameplay",
912
+ "user",
913
+ "choose",
914
+ "enter",
915
+ "name",
916
+ "cop",
917
+ "brittany",
918
+ "life",
919
+ "peek",
920
+ "dark",
921
+ "strum",
922
+ "guitarist",
923
+ "gunfire",
924
+ "syrium",
925
+ "firefight",
926
+ "helicopter",
927
+ "isi",
928
+ "army",
929
+ "smoke",
930
+ "terror",
931
+ "bombing",
932
+ "reporter",
933
+ "bomb",
934
+ "engine",
935
+ "garry",
936
+ "lightyear",
937
+ "receipe",
938
+ "flmy",
939
+ "tutorial",
940
+ "smell",
941
+ "taste",
942
+ "eubank",
943
+ "fighter",
944
+ "recall",
945
+ "spinach",
946
+ "lasagna",
947
+ "object",
948
+ "ply",
949
+ "skin",
950
+ "juice",
951
+ "tiki",
952
+ "ferrai",
953
+ "formula",
954
+ "racing",
955
+ "comantery",
956
+ "boyfriend",
957
+ "threee",
958
+ "talent",
959
+ "hear",
960
+ "confront",
961
+ "rubber",
962
+ "layout",
963
+ "instruct",
964
+ "pop",
965
+ "dinner",
966
+ "date",
967
+ "clock",
968
+ "tick",
969
+ "round",
970
+ "earthquake",
971
+ "scientist",
972
+ "talkshow",
973
+ "boob",
974
+ "drop",
975
+ "dirt",
976
+ "mes",
977
+ "instrument",
978
+ "service",
979
+ "foot",
980
+ "opening",
981
+ "article",
982
+ "guest",
983
+ "homer",
984
+ "light",
985
+ "technology",
986
+ "partner",
987
+ "ches",
988
+ "consume",
989
+ "bankruptcy",
990
+ "nick",
991
+ "treehouse",
992
+ "bike",
993
+ "anime",
994
+ "sponngebob",
995
+ "annoy",
996
+ "tease",
997
+ "taxi",
998
+ "method",
999
+ "gadget",
1000
+ "gentleman",
1001
+ "customize",
1002
+ "proces",
1003
+ "terrify",
1004
+ "creep",
1005
+ "reach",
1006
+ "claw",
1007
+ "ceiling",
1008
+ "flow",
1009
+ "scroll",
1010
+ "invade",
1011
+ "newspaper",
1012
+ "writer",
1013
+ "document",
1014
+ "gambling",
1015
+ "gamble",
1016
+ "punch",
1017
+ "alert",
1018
+ "letter",
1019
+ "fiskal",
1020
+ "magentum",
1021
+ "block",
1022
+ "pig",
1023
+ "concern",
1024
+ "hilary",
1025
+ "hillary",
1026
+ "clinton",
1027
+ "confederate",
1028
+ "speech",
1029
+ "politician",
1030
+ "struggle",
1031
+ "standing",
1032
+ "vocalist",
1033
+ "playground",
1034
+ "stadium",
1035
+ "scrum",
1036
+ "result",
1037
+ "handle",
1038
+ "goal",
1039
+ "lasso",
1040
+ "mansion",
1041
+ "benson",
1042
+ "operation",
1043
+ "kick",
1044
+ "vlog",
1045
+ "lecture",
1046
+ "lecturer",
1047
+ "biology",
1048
+ "deliver",
1049
+ "meeting",
1050
+ "newsman",
1051
+ "hospital",
1052
+ "package",
1053
+ "patient",
1054
+ "recovery",
1055
+ "recipiesaresimple",
1056
+ "gravy",
1057
+ "soup",
1058
+ "appeal",
1059
+ "scoop",
1060
+ "fill",
1061
+ "conclude",
1062
+ "desert",
1063
+ "smash",
1064
+ "demolish",
1065
+ "truck",
1066
+ "crash",
1067
+ "coat",
1068
+ "homeles",
1069
+ "happiness",
1070
+ "luggage",
1071
+ "think",
1072
+ "part",
1073
+ "tool",
1074
+ "pool",
1075
+ "july",
1076
+ "base",
1077
+ "run",
1078
+ "kneel",
1079
+ "score",
1080
+ "mis",
1081
+ "diamond",
1082
+ "bag",
1083
+ "root",
1084
+ "magnify",
1085
+ "jewel",
1086
+ "heroin",
1087
+ "longingly",
1088
+ "hero",
1089
+ "chapter",
1090
+ "peer",
1091
+ "tshirt",
1092
+ "paperwork",
1093
+ "booty",
1094
+ "comb",
1095
+ "hose",
1096
+ "educate",
1097
+ "pitcher",
1098
+ "gravity",
1099
+ "experiment",
1100
+ "bucket",
1101
+ "freezing",
1102
+ "tray",
1103
+ "sweet",
1104
+ "storage",
1105
+ "pureed",
1106
+ "container",
1107
+ "potaato",
1108
+ "freeze",
1109
+ "brother",
1110
+ "coddle",
1111
+ "pet",
1112
+ "ldy",
1113
+ "yard",
1114
+ "calm",
1115
+ "fortres",
1116
+ "till",
1117
+ "thief",
1118
+ "viewer",
1119
+ "feedback",
1120
+ "contestent",
1121
+ "competitor",
1122
+ "hiliary",
1123
+ "cspan",
1124
+ "hatdoll",
1125
+ "lavigne",
1126
+ "advertiesment",
1127
+ "glouse",
1128
+ "crown",
1129
+ "government",
1130
+ "peformance",
1131
+ "photograph",
1132
+ "towel",
1133
+ "buffalo",
1134
+ "universe",
1135
+ "lense",
1136
+ "cell",
1137
+ "fetu",
1138
+ "nyc",
1139
+ "ramp",
1140
+ "goldfish",
1141
+ "destroy",
1142
+ "videogame",
1143
+ "dig",
1144
+ "hell",
1145
+ "bake",
1146
+ "vinegar",
1147
+ "ambulance",
1148
+ "highway",
1149
+ "beat",
1150
+ "kill",
1151
+ "semolina",
1152
+ "thee",
1153
+ "fart",
1154
+ "phrase",
1155
+ "challenge",
1156
+ "herself",
1157
+ "profanity",
1158
+ "obama",
1159
+ "president",
1160
+ "steve",
1161
+ "depict",
1162
+ "researcher",
1163
+ "horror",
1164
+ "barber",
1165
+ "converse",
1166
+ "sweeny",
1167
+ "chemistry",
1168
+ "state",
1169
+ "reader",
1170
+ "medicine",
1171
+ "lamma",
1172
+ "gress",
1173
+ "alpaca",
1174
+ "enclosure",
1175
+ "pin",
1176
+ "lama",
1177
+ "pen",
1178
+ "pram",
1179
+ "device",
1180
+ "smartphone",
1181
+ "app",
1182
+ "troop",
1183
+ "goop",
1184
+ "offer",
1185
+ "channel",
1186
+ "makeup",
1187
+ "montage",
1188
+ "pole",
1189
+ "slideshow",
1190
+ "designer",
1191
+ "candy",
1192
+ "prank",
1193
+ "finger",
1194
+ "swear",
1195
+ "tmz",
1196
+ "topic",
1197
+ "navigate",
1198
+ "yellow",
1199
+ "garbage",
1200
+ "carpet",
1201
+ "roll",
1202
+ "maintenance",
1203
+ "janitor",
1204
+ "lunchroom",
1205
+ "apple",
1206
+ "cafeterium",
1207
+ "bite",
1208
+ "mop",
1209
+ "stuff",
1210
+ "parade",
1211
+ "scooter",
1212
+ "orchestra",
1213
+ "carnival",
1214
+ "standstill",
1215
+ "vintage",
1216
+ "control",
1217
+ "playing",
1218
+ "backround",
1219
+ "roar",
1220
+ "pretend",
1221
+ "cpr",
1222
+ "wedding",
1223
+ "tangle",
1224
+ "earbud",
1225
+ "lid",
1226
+ "sandhole",
1227
+ "train",
1228
+ "crime",
1229
+ "denial",
1230
+ "restrain",
1231
+ "file",
1232
+ "canal",
1233
+ "applaud",
1234
+ "clap",
1235
+ "dad",
1236
+ "story",
1237
+ "vigil",
1238
+ "newscastor",
1239
+ "newswoman",
1240
+ "volley",
1241
+ "neon",
1242
+ "muppet",
1243
+ "king",
1244
+ "theater",
1245
+ "pan",
1246
+ "form",
1247
+ "mold",
1248
+ "mini",
1249
+ "spoon",
1250
+ "motorcycle",
1251
+ "motorbike",
1252
+ "freeway",
1253
+ "picnic",
1254
+ "finn",
1255
+ "range",
1256
+ "engineering",
1257
+ "encourage",
1258
+ "slam",
1259
+ "gate",
1260
+ "tale",
1261
+ "skier",
1262
+ "represent",
1263
+ "surf",
1264
+ "surfboard",
1265
+ "snowboard",
1266
+ "cloud",
1267
+ "arrange",
1268
+ "boo",
1269
+ "disco",
1270
+ "singing",
1271
+ "correspond",
1272
+ "gogh",
1273
+ "screnn",
1274
+ "bernard",
1275
+ "premiere",
1276
+ "care",
1277
+ "salon",
1278
+ "saloon",
1279
+ "forest",
1280
+ "arm",
1281
+ "treatment",
1282
+ "van",
1283
+ "vanwhile",
1284
+ "msn",
1285
+ "dalmation",
1286
+ "pub",
1287
+ "rumor",
1288
+ "explode",
1289
+ "yell",
1290
+ "firework",
1291
+ "dashboard",
1292
+ "stereo",
1293
+ "honda",
1294
+ "robbie",
1295
+ "advocate",
1296
+ "birth",
1297
+ "rat",
1298
+ "hamster",
1299
+ "rodent",
1300
+ "owner",
1301
+ "sniff",
1302
+ "comfort",
1303
+ "attach",
1304
+ "remodel",
1305
+ "glue",
1306
+ "interior",
1307
+ "umpire",
1308
+ "tumble",
1309
+ "become",
1310
+ "collage",
1311
+ "ipod",
1312
+ "comedy",
1313
+ "snake",
1314
+ "shadow",
1315
+ "collar",
1316
+ "frame",
1317
+ "master",
1318
+ "dinosaur",
1319
+ "dinosor",
1320
+ "print",
1321
+ "carfax",
1322
+ "ralley",
1323
+ "tournament",
1324
+ "kium",
1325
+ "teni",
1326
+ "amaze",
1327
+ "compilation",
1328
+ "spar",
1329
+ "marshal",
1330
+ "order",
1331
+ "camel",
1332
+ "auburn",
1333
+ "mustard",
1334
+ "murrey",
1335
+ "jar",
1336
+ "festival",
1337
+ "firearm",
1338
+ "handgun",
1339
+ "magazine",
1340
+ "insert",
1341
+ "surround",
1342
+ "flirt",
1343
+ "cafe",
1344
+ "cupid",
1345
+ "contain",
1346
+ "footage",
1347
+ "wait",
1348
+ "structure",
1349
+ "left",
1350
+ "portrait",
1351
+ "sun",
1352
+ "revolve",
1353
+ "hallway",
1354
+ "glitter",
1355
+ "command",
1356
+ "officer",
1357
+ "solider",
1358
+ "formation",
1359
+ "computergame",
1360
+ "strategy",
1361
+ "puzzle",
1362
+ "memorize",
1363
+ "broth",
1364
+ "bowel",
1365
+ "chicken",
1366
+ "leveler",
1367
+ "journalist",
1368
+ "pitch",
1369
+ "occur",
1370
+ "routine",
1371
+ "oatmeal",
1372
+ "ballad",
1373
+ "listen",
1374
+ "audition",
1375
+ "toilet",
1376
+ "flush",
1377
+ "commode",
1378
+ "fainting",
1379
+ "needle",
1380
+ "faint",
1381
+ "spell",
1382
+ "chase",
1383
+ "pit",
1384
+ "hole",
1385
+ "scooby",
1386
+ "asteroid",
1387
+ "lunar",
1388
+ "impact",
1389
+ "streamer",
1390
+ "growth",
1391
+ "antelope",
1392
+ "technique",
1393
+ "churro",
1394
+ "countertop",
1395
+ "counter",
1396
+ "steam",
1397
+ "mood",
1398
+ "breakfast",
1399
+ "scowl",
1400
+ "facebook",
1401
+ "research",
1402
+ "examine",
1403
+ "paleontologist",
1404
+ "mascara",
1405
+ "hottie",
1406
+ "camouflage",
1407
+ "atv",
1408
+ "smokey",
1409
+ "zombie",
1410
+ "teenage",
1411
+ "aim",
1412
+ "masala",
1413
+ "frying",
1414
+ "village",
1415
+ "crocodile",
1416
+ "aligator",
1417
+ "alligator",
1418
+ "paste",
1419
+ "screw",
1420
+ "gum",
1421
+ "pipe",
1422
+ "gel",
1423
+ "gumbo",
1424
+ "hop",
1425
+ "complain",
1426
+ "tan",
1427
+ "lotion",
1428
+ "toples",
1429
+ "taboo",
1430
+ "day",
1431
+ "bikini",
1432
+ "skillet",
1433
+ "pancake",
1434
+ "zoom",
1435
+ "destination",
1436
+ "option",
1437
+ "capture",
1438
+ "zooming",
1439
+ "execise",
1440
+ "gymanasium",
1441
+ "integrate",
1442
+ "guide",
1443
+ "ryan",
1444
+ "progress",
1445
+ "crazy",
1446
+ "mike",
1447
+ "russium",
1448
+ "vladmir",
1449
+ "vladimir",
1450
+ "addres",
1451
+ "policy",
1452
+ "gown",
1453
+ "spirit",
1454
+ "shop",
1455
+ "journey",
1456
+ "fog",
1457
+ "hunt",
1458
+ "peak",
1459
+ "checkerd",
1460
+ "fault",
1461
+ "alien",
1462
+ "thank",
1463
+ "bal",
1464
+ "iphone",
1465
+ "guinea",
1466
+ "bathroom",
1467
+ "dry",
1468
+ "button",
1469
+ "chilli",
1470
+ "infusion",
1471
+ "slouse",
1472
+ "fanlala",
1473
+ "airport",
1474
+ "hindu",
1475
+ "marriage",
1476
+ "piccolo",
1477
+ "dragonball",
1478
+ "dome",
1479
+ "landmark",
1480
+ "suggest",
1481
+ "tunisium",
1482
+ "danger",
1483
+ "obstacle",
1484
+ "lot",
1485
+ "simple",
1486
+ "earring",
1487
+ "ear",
1488
+ "nasa",
1489
+ "graph",
1490
+ "microsoft",
1491
+ "spreadsheet",
1492
+ "somebody",
1493
+ "parker",
1494
+ "girlfriend",
1495
+ "focu",
1496
+ "indium",
1497
+ "netrus",
1498
+ "ladle",
1499
+ "aid",
1500
+ "university",
1501
+ "rifle",
1502
+ "flatter",
1503
+ "bulldozer",
1504
+ "excavator",
1505
+ "vampire",
1506
+ "fiddle",
1507
+ "israeli",
1508
+ "isreal",
1509
+ "infiltrate",
1510
+ "amusement",
1511
+ "sketch",
1512
+ "seasoning",
1513
+ "salt",
1514
+ "pepper",
1515
+ "steak",
1516
+ "season",
1517
+ "course",
1518
+ "return",
1519
+ "collect",
1520
+ "superb",
1521
+ "wisconsin",
1522
+ "nature",
1523
+ "valley",
1524
+ "hike",
1525
+ "trouble",
1526
+ "failure",
1527
+ "excise",
1528
+ "vertacle",
1529
+ "basketballer",
1530
+ "volcano",
1531
+ "map",
1532
+ "satellite",
1533
+ "sprinkle",
1534
+ "verde",
1535
+ "braid",
1536
+ "africa",
1537
+ "kenya",
1538
+ "interviewer",
1539
+ "gymnasium",
1540
+ "dodge",
1541
+ "automobile",
1542
+ "beard",
1543
+ "title",
1544
+ "beneath",
1545
+ "circket",
1546
+ "rockstar",
1547
+ "joan",
1548
+ "dave",
1549
+ "jam",
1550
+ "interface",
1551
+ "screencast",
1552
+ "coversation",
1553
+ "five-ing",
1554
+ "mltiple",
1555
+ "smack",
1556
+ "nadium",
1557
+ "stretcher",
1558
+ "overlap",
1559
+ "action",
1560
+ "teaser",
1561
+ "ledge",
1562
+ "overlook",
1563
+ "await",
1564
+ "rooftop",
1565
+ "invite",
1566
+ "bbc",
1567
+ "mall",
1568
+ "arrest",
1569
+ "dhabi",
1570
+ "conflict",
1571
+ "suspect",
1572
+ "critique",
1573
+ "expres",
1574
+ "buy",
1575
+ "diver",
1576
+ "scuba",
1577
+ "trunk",
1578
+ "bolt",
1579
+ "dream",
1580
+ "gymnast",
1581
+ "forward",
1582
+ "bar",
1583
+ "auditorium",
1584
+ "list",
1585
+ "ingredient",
1586
+ "pale",
1587
+ "tackle",
1588
+ "support",
1589
+ "beautiful",
1590
+ "back",
1591
+ "bedsheet",
1592
+ "mincraft",
1593
+ "emerald",
1594
+ "job",
1595
+ "york",
1596
+ "harlow",
1597
+ "tomorrow",
1598
+ "study",
1599
+ "sofa",
1600
+ "furniture",
1601
+ "saleman",
1602
+ "beef",
1603
+ "molecule",
1604
+ "chip",
1605
+ "check",
1606
+ "crew",
1607
+ "director",
1608
+ "warfield",
1609
+ "shaq",
1610
+ "balding",
1611
+ "microstructure",
1612
+ "web",
1613
+ "website",
1614
+ "checkmark",
1615
+ "relevant",
1616
+ "ethopium",
1617
+ "risk",
1618
+ "metallica",
1619
+ "battlefield",
1620
+ "mario",
1621
+ "receive",
1622
+ "anchorwoman",
1623
+ "gay",
1624
+ "hello",
1625
+ "futbol",
1626
+ "hollywood",
1627
+ "gtum",
1628
+ "breakdancing",
1629
+ "unbox",
1630
+ "plastic",
1631
+ "drown",
1632
+ "surfer",
1633
+ "drug",
1634
+ "gain",
1635
+ "flip",
1636
+ "transfer",
1637
+ "reflect",
1638
+ "colorful",
1639
+ "tbale",
1640
+ "bridge",
1641
+ "scenery",
1642
+ "coke",
1643
+ "soda",
1644
+ "cola",
1645
+ "saucepan",
1646
+ "stovetop",
1647
+ "coca",
1648
+ "moras",
1649
+ "dessert",
1650
+ "oniontomato",
1651
+ "materialism",
1652
+ "satan",
1653
+ "televison",
1654
+ "atom",
1655
+ "rectangle",
1656
+ "coupe",
1657
+ "damage",
1658
+ "optimu",
1659
+ "butter",
1660
+ "vegetable",
1661
+ "caramelize",
1662
+ "strut",
1663
+ "stroll",
1664
+ "lingerie",
1665
+ "protrude",
1666
+ "bleed",
1667
+ "bone",
1668
+ "audtioner",
1669
+ "safety",
1670
+ "charge",
1671
+ "sniper",
1672
+ "brunette",
1673
+ "eva",
1674
+ "environment",
1675
+ "thought",
1676
+ "handsom",
1677
+ "foul",
1678
+ "murder",
1679
+ "closet",
1680
+ "wearing",
1681
+ "sticker",
1682
+ "transmit",
1683
+ "tape",
1684
+ "glove",
1685
+ "nose",
1686
+ "stroke",
1687
+ "hairstyle",
1688
+ "actres",
1689
+ "isabella",
1690
+ "underworld",
1691
+ "projector",
1692
+ "shark",
1693
+ "sauce",
1694
+ "executive",
1695
+ "spicy",
1696
+ "plat",
1697
+ "curry",
1698
+ "guidance",
1699
+ "candle",
1700
+ "seance",
1701
+ "hall",
1702
+ "peanut",
1703
+ "chocolate",
1704
+ "toffee",
1705
+ "movement",
1706
+ "wake",
1707
+ "policeman",
1708
+ "investigation",
1709
+ "reel",
1710
+ "fisherman",
1711
+ "skateboarder",
1712
+ "racecar",
1713
+ "racetrack",
1714
+ "banister",
1715
+ "trail",
1716
+ "glider",
1717
+ "sexy",
1718
+ "quest",
1719
+ "captain",
1720
+ "blond",
1721
+ "shampoo",
1722
+ "grooming",
1723
+ "sherrus",
1724
+ "steel",
1725
+ "cros",
1726
+ "tree",
1727
+ "word",
1728
+ "onion",
1729
+ "hairanother",
1730
+ "groom",
1731
+ "eckersley",
1732
+ "magic",
1733
+ "cuisine",
1734
+ "customer",
1735
+ "necklace",
1736
+ "tube",
1737
+ "straighten",
1738
+ "wind",
1739
+ "chain",
1740
+ "benefit",
1741
+ "sentence",
1742
+ "bottom",
1743
+ "chow",
1744
+ "bottl",
1745
+ "juggeling",
1746
+ "juggle",
1747
+ "juggling",
1748
+ "mixer",
1749
+ "processor",
1750
+ "mixture",
1751
+ "whole",
1752
+ "crush",
1753
+ "grind",
1754
+ "melon",
1755
+ "promote",
1756
+ "company",
1757
+ "queen",
1758
+ "mustache",
1759
+ "noise",
1760
+ "goofy",
1761
+ "stomach",
1762
+ "cheek",
1763
+ "blush",
1764
+ "tattooed",
1765
+ "undergarment",
1766
+ "heart",
1767
+ "violent",
1768
+ "strength",
1769
+ "darknes",
1770
+ "creepy",
1771
+ "collision",
1772
+ "jennifer",
1773
+ "converstaion",
1774
+ "nutrition",
1775
+ "sweater",
1776
+ "illustrate",
1777
+ "grapple",
1778
+ "referee",
1779
+ "victory",
1780
+ "expansion",
1781
+ "carpenter",
1782
+ "trancition",
1783
+ "nail",
1784
+ "fisher",
1785
+ "translate",
1786
+ "shoulder",
1787
+ "presentation",
1788
+ "sex",
1789
+ "tea",
1790
+ "sip",
1791
+ "turtle",
1792
+ "corner",
1793
+ "babt",
1794
+ "pond",
1795
+ "modeling",
1796
+ "paddle",
1797
+ "bye",
1798
+ "drench",
1799
+ "cleaning",
1800
+ "ment",
1801
+ "synch",
1802
+ "overhead",
1803
+ "pingpong",
1804
+ "olympic",
1805
+ "exciting",
1806
+ "weekday",
1807
+ "lettering",
1808
+ "harrison",
1809
+ "bay",
1810
+ "flex",
1811
+ "lifting",
1812
+ "weight",
1813
+ "wayne",
1814
+ "aeroplane",
1815
+ "tractor",
1816
+ "rider",
1817
+ "horseback",
1818
+ "lamborghini",
1819
+ "running",
1820
+ "playgrond",
1821
+ "simulate",
1822
+ "otehr",
1823
+ "speach",
1824
+ "barack",
1825
+ "nation",
1826
+ "ohio",
1827
+ "motherand",
1828
+ "affection",
1829
+ "laughing",
1830
+ "ballet",
1831
+ "terrific",
1832
+ "composition",
1833
+ "choreographer",
1834
+ "zipline",
1835
+ "hairdryer",
1836
+ "gelatin",
1837
+ "texture",
1838
+ "dryer",
1839
+ "wrestler",
1840
+ "arena",
1841
+ "fool",
1842
+ "fail",
1843
+ "spaciousnes",
1844
+ "reveal",
1845
+ "corridor",
1846
+ "dermatologist",
1847
+ "charity",
1848
+ "rosacea",
1849
+ "cancer",
1850
+ "teaching",
1851
+ "equation",
1852
+ "factor",
1853
+ "spider",
1854
+ "roleplay",
1855
+ "villain",
1856
+ "blind",
1857
+ "fury",
1858
+ "loki",
1859
+ "patch",
1860
+ "instrumental",
1861
+ "tone",
1862
+ "calender",
1863
+ "alarm",
1864
+ "calendar",
1865
+ "torch",
1866
+ "flashlight",
1867
+ "christmastime",
1868
+ "mannequin",
1869
+ "wander",
1870
+ "brighter",
1871
+ "avatar",
1872
+ "combat",
1873
+ "motionles",
1874
+ "iran",
1875
+ "roam",
1876
+ "steal",
1877
+ "documentary",
1878
+ "hyena",
1879
+ "pester",
1880
+ "anchor",
1881
+ "summer",
1882
+ "jesu",
1883
+ "chant",
1884
+ "mingle",
1885
+ "hotel",
1886
+ "elevator",
1887
+ "joseph",
1888
+ "mic",
1889
+ "widget",
1890
+ "pip",
1891
+ "jeep",
1892
+ "absorber",
1893
+ "staircase",
1894
+ "wood",
1895
+ "improvement",
1896
+ "flooring",
1897
+ "floring",
1898
+ "plywood",
1899
+ "deck",
1900
+ "drool",
1901
+ "toaster",
1902
+ "puree",
1903
+ "simmer",
1904
+ "refer",
1905
+ "tab",
1906
+ "tablet",
1907
+ "library",
1908
+ "youngster",
1909
+ "smart",
1910
+ "six",
1911
+ "purple",
1912
+ "pacify",
1913
+ "motorway",
1914
+ "sedan",
1915
+ "spotlight",
1916
+ "grease",
1917
+ "mosquito",
1918
+ "scope",
1919
+ "announcement",
1920
+ "praise",
1921
+ "sputnik",
1922
+ "degras",
1923
+ "convention",
1924
+ "tyson",
1925
+ "unscrew",
1926
+ "unbolt",
1927
+ "instrunment",
1928
+ "shin",
1929
+ "pony",
1930
+ "choreograph",
1931
+ "fountain",
1932
+ "brown",
1933
+ "harvey",
1934
+ "someine",
1935
+ "drawing",
1936
+ "joy",
1937
+ "matter",
1938
+ "draft",
1939
+ "raft",
1940
+ "pontoon",
1941
+ "reef",
1942
+ "micheal",
1943
+ "cuddle",
1944
+ "vehicale",
1945
+ "latitude",
1946
+ "skincare",
1947
+ "inside",
1948
+ "confession",
1949
+ "youth",
1950
+ "tower",
1951
+ "church",
1952
+ "lover",
1953
+ "step",
1954
+ "rob",
1955
+ "lorry",
1956
+ "firefighter",
1957
+ "firetruck",
1958
+ "webpage",
1959
+ "troubleshoot",
1960
+ "distance",
1961
+ "direction",
1962
+ "maintain",
1963
+ "cube",
1964
+ "rubik",
1965
+ "ruby",
1966
+ "rubix",
1967
+ "solve",
1968
+ "cookery",
1969
+ "guillotine",
1970
+ "mist",
1971
+ "mask",
1972
+ "bing",
1973
+ "position",
1974
+ "tmy",
1975
+ "pastry",
1976
+ "powder",
1977
+ "unicorn",
1978
+ "layup",
1979
+ "middle",
1980
+ "sounding",
1981
+ "sound",
1982
+ "winter",
1983
+ "santum",
1984
+ "christma",
1985
+ "timberlake",
1986
+ "justin",
1987
+ "creek",
1988
+ "tourist",
1989
+ "ram",
1990
+ "ribeye",
1991
+ "foil",
1992
+ "harmonica",
1993
+ "sandwich",
1994
+ "sandwhich",
1995
+ "pregnancy",
1996
+ "brightnes",
1997
+ "fiction",
1998
+ "age",
1999
+ "center",
2000
+ "gathering",
2001
+ "alcohol",
2002
+ "wireles",
2003
+ "radio",
2004
+ "volume",
2005
+ "aguy",
2006
+ "coffee",
2007
+ "buzz",
2008
+ "dial",
2009
+ "ventura",
2010
+ "campaign",
2011
+ "collection",
2012
+ "super",
2013
+ "governor",
2014
+ "christie",
2015
+ "sunday",
2016
+ "meghan",
2017
+ "poll",
2018
+ "chri",
2019
+ "caller",
2020
+ "ski",
2021
+ "snowy",
2022
+ "bull",
2023
+ "clown",
2024
+ "jury",
2025
+ "champion",
2026
+ "programm",
2027
+ "gallery",
2028
+ "booth",
2029
+ "icon",
2030
+ "flowerpot",
2031
+ "bassinet",
2032
+ "krueger",
2033
+ "kruger",
2034
+ "kreuger",
2035
+ "freddy",
2036
+ "fatality",
2037
+ "gladiator",
2038
+ "lipstick",
2039
+ "amore",
2040
+ "brand",
2041
+ "lipglos",
2042
+ "flag",
2043
+ "lawn",
2044
+ "rev",
2045
+ "site",
2046
+ "enemy",
2047
+ "bent",
2048
+ "dash",
2049
+ "riffle",
2050
+ "grow",
2051
+ "cohabit",
2052
+ "tiger",
2053
+ "mouth",
2054
+ "edit",
2055
+ "adobe",
2056
+ "photoshop",
2057
+ "manager",
2058
+ "romanium",
2059
+ "ancient",
2060
+ "vine",
2061
+ "opponent",
2062
+ "dunk",
2063
+ "hipster",
2064
+ "energy",
2065
+ "monster",
2066
+ "tribute",
2067
+ "cabinet",
2068
+ "dresser",
2069
+ "twirl",
2070
+ "sporting",
2071
+ "golf",
2072
+ "golfer",
2073
+ "shine",
2074
+ "choir",
2075
+ "gospel",
2076
+ "partake",
2077
+ "smosh",
2078
+ "timey",
2079
+ "colonial",
2080
+ "multiple",
2081
+ "multiplication",
2082
+ "fruit",
2083
+ "lotu",
2084
+ "potato",
2085
+ "peel",
2086
+ "scrap",
2087
+ "curve",
2088
+ "acceleration",
2089
+ "garage",
2090
+ "cap",
2091
+ "demand",
2092
+ "ghost",
2093
+ "drummer",
2094
+ "breathe",
2095
+ "effect",
2096
+ "image",
2097
+ "frock",
2098
+ "marketing",
2099
+ "hawk",
2100
+ "makeshift",
2101
+ "skeleton",
2102
+ "graveyard",
2103
+ "cemetery",
2104
+ "mulitple",
2105
+ "oeioke",
2106
+ "protest",
2107
+ "reply",
2108
+ "protester",
2109
+ "contest",
2110
+ "cobweb",
2111
+ "dump",
2112
+ "drama",
2113
+ "airliner",
2114
+ "staff",
2115
+ "wing",
2116
+ "wipeout",
2117
+ "toddler",
2118
+ "refresh",
2119
+ "swimming",
2120
+ "ripple",
2121
+ "garlic",
2122
+ "feed",
2123
+ "nuzzle",
2124
+ "swimsuit",
2125
+ "mexico",
2126
+ "succes",
2127
+ "communicate",
2128
+ "asses",
2129
+ "treasure",
2130
+ "stun",
2131
+ "trader",
2132
+ "throat",
2133
+ "perish",
2134
+ "kissing",
2135
+ "revenge",
2136
+ "meatloaf",
2137
+ "stick",
2138
+ "gather",
2139
+ "glistening",
2140
+ "competion",
2141
+ "net",
2142
+ "filter",
2143
+ "spray",
2144
+ "inspect",
2145
+ "gsr",
2146
+ "inclosure",
2147
+ "beside",
2148
+ "mill",
2149
+ "din",
2150
+ "bring",
2151
+ "population",
2152
+ "batter",
2153
+ "midwife",
2154
+ "roadside",
2155
+ "picket",
2156
+ "march",
2157
+ "jacket",
2158
+ "cast",
2159
+ "seleton",
2160
+ "handsome",
2161
+ "force",
2162
+ "abuse",
2163
+ "stomp",
2164
+ "subway",
2165
+ "mimic",
2166
+ "tax",
2167
+ "newscast",
2168
+ "krabby",
2169
+ "plankton",
2170
+ "horizon",
2171
+ "hiker",
2172
+ "downhill",
2173
+ "right",
2174
+ "hoodie",
2175
+ "teen",
2176
+ "style",
2177
+ "goku",
2178
+ "sade",
2179
+ "charger",
2180
+ "carbon",
2181
+ "drain",
2182
+ "kidney",
2183
+ "lettuce",
2184
+ "leafy",
2185
+ "utensil",
2186
+ "punching",
2187
+ "kit",
2188
+ "boxer",
2189
+ "training",
2190
+ "bunching",
2191
+ "deconstruct",
2192
+ "blockchain",
2193
+ "symposium",
2194
+ "jacobson",
2195
+ "resort",
2196
+ "accord",
2197
+ "presenter",
2198
+ "correspondent",
2199
+ "son",
2200
+ "clone",
2201
+ "int",
2202
+ "entertainment",
2203
+ "ordinance",
2204
+ "tha",
2205
+ "audiance",
2206
+ "tennisball",
2207
+ "racket",
2208
+ "snowboarder",
2209
+ "audi",
2210
+ "competation",
2211
+ "ferrarus",
2212
+ "clarkson",
2213
+ "crowbar",
2214
+ "stop",
2215
+ "trainer",
2216
+ "signature",
2217
+ "commando",
2218
+ "riot",
2219
+ "interlace",
2220
+ "swordfight",
2221
+ "butt",
2222
+ "fend",
2223
+ "engage",
2224
+ "battle",
2225
+ "celery",
2226
+ "scavenge",
2227
+ "witch",
2228
+ "motor",
2229
+ "bracket",
2230
+ "wto",
2231
+ "green",
2232
+ "johnny",
2233
+ "sman",
2234
+ "cause",
2235
+ "entail",
2236
+ "potatoe",
2237
+ "careful",
2238
+ "hils",
2239
+ "quaker",
2240
+ "mason",
2241
+ "hostage",
2242
+ "somali",
2243
+ "creation",
2244
+ "execute",
2245
+ "rally",
2246
+ "exchange",
2247
+ "phil",
2248
+ "churchill",
2249
+ "churchiill",
2250
+ "brawl",
2251
+ "scuffle",
2252
+ "cricket",
2253
+ "laksa",
2254
+ "pikachu",
2255
+ "sampling",
2256
+ "vidoe",
2257
+ "glow",
2258
+ "rise",
2259
+ "receptor",
2260
+ "tail",
2261
+ "chid",
2262
+ "babbling",
2263
+ "chew",
2264
+ "paparazzi",
2265
+ "premier",
2266
+ "nothing",
2267
+ "marian",
2268
+ "relationship",
2269
+ "comapny",
2270
+ "donate",
2271
+ "cycle",
2272
+ "bicycle",
2273
+ "bycycling",
2274
+ "sparkling",
2275
+ "ribbon",
2276
+ "suacepan",
2277
+ "peter",
2278
+ "griffin",
2279
+ "connection",
2280
+ "boxing",
2281
+ "beatboxing",
2282
+ "twitter",
2283
+ "macbook",
2284
+ "respond",
2285
+ "watchmojo",
2286
+ "empire",
2287
+ "terrifying",
2288
+ "llama",
2289
+ "sheep",
2290
+ "waterway",
2291
+ "froth",
2292
+ "pie",
2293
+ "aluminum",
2294
+ "alumminum",
2295
+ "roasting",
2296
+ "crust",
2297
+ "aluminium",
2298
+ "rent",
2299
+ "analysi",
2300
+ "professional",
2301
+ "simulation",
2302
+ "land",
2303
+ "descend",
2304
+ "measuring",
2305
+ "measure",
2306
+ "flavor",
2307
+ "inform",
2308
+ "fellow",
2309
+ "gerbil",
2310
+ "tunnel",
2311
+ "wiggle",
2312
+ "peep",
2313
+ "stock",
2314
+ "menu",
2315
+ "divine",
2316
+ "candidate",
2317
+ "trump",
2318
+ "donald",
2319
+ "addiction",
2320
+ "department",
2321
+ "attire",
2322
+ "bread",
2323
+ "grandmother",
2324
+ "fridge",
2325
+ "blood",
2326
+ "brain",
2327
+ "harissa",
2328
+ "compile",
2329
+ "disgust",
2330
+ "cutlet",
2331
+ "prize",
2332
+ "slope",
2333
+ "mountainside",
2334
+ "cannon",
2335
+ "lantern",
2336
+ "jack",
2337
+ "projection",
2338
+ "circulate",
2339
+ "rod",
2340
+ "copper",
2341
+ "flute",
2342
+ "length",
2343
+ "intercourse",
2344
+ "grog",
2345
+ "radicalization",
2346
+ "terrorism",
2347
+ "ticker",
2348
+ "broccoli",
2349
+ "brolic",
2350
+ "ramsey",
2351
+ "gordon",
2352
+ "pres",
2353
+ "gift",
2354
+ "hindi",
2355
+ "rip",
2356
+ "cartoonist",
2357
+ "badminton",
2358
+ "badmitten",
2359
+ "batmanton",
2360
+ "indoor",
2361
+ "excite",
2362
+ "surveillance",
2363
+ "alleyway",
2364
+ "integra",
2365
+ "florida",
2366
+ "backpack",
2367
+ "everything",
2368
+ "bollywood",
2369
+ "acura",
2370
+ "umbrella",
2371
+ "spend",
2372
+ "sausage",
2373
+ "saussage",
2374
+ "straightener",
2375
+ "death",
2376
+ "question",
2377
+ "bagpiper",
2378
+ "muffin",
2379
+ "biscuit",
2380
+ "showroom",
2381
+ "dealer",
2382
+ "glance",
2383
+ "loi",
2384
+ "eyebrow",
2385
+ "reprimand",
2386
+ "release",
2387
+ "trancy",
2388
+ "restaraunt",
2389
+ "hive",
2390
+ "beehive",
2391
+ "woma",
2392
+ "spice",
2393
+ "rack",
2394
+ "sloth",
2395
+ "subscription",
2396
+ "implore",
2397
+ "subscribe",
2398
+ "dominate",
2399
+ "entourage",
2400
+ "randomer",
2401
+ "dollar",
2402
+ "zynga",
2403
+ "sillouette",
2404
+ "rovio",
2405
+ "facility",
2406
+ "kung",
2407
+ "ninja",
2408
+ "karate",
2409
+ "standoff",
2410
+ "square",
2411
+ "sprint",
2412
+ "cart",
2413
+ "aisle",
2414
+ "purchase",
2415
+ "grab",
2416
+ "duck",
2417
+ "refrigerate",
2418
+ "foo",
2419
+ "rowboat",
2420
+ "opeing",
2421
+ "goodbye",
2422
+ "cycling",
2423
+ "harley",
2424
+ "baymax",
2425
+ "mate",
2426
+ "videotape",
2427
+ "skydive",
2428
+ "pluck",
2429
+ "avril",
2430
+ "number",
2431
+ "count",
2432
+ "hummer",
2433
+ "motorcyclist",
2434
+ "pulsate",
2435
+ "sparkle",
2436
+ "past",
2437
+ "trip",
2438
+ "prisoner",
2439
+ "albino",
2440
+ "gage",
2441
+ "example",
2442
+ "while",
2443
+ "piano",
2444
+ "countles",
2445
+ "techno",
2446
+ "cow",
2447
+ "headbutt",
2448
+ "level",
2449
+ "station",
2450
+ "railway",
2451
+ "viewpoint",
2452
+ "courtesy",
2453
+ "george",
2454
+ "receiver",
2455
+ "tuxedo",
2456
+ "stocking",
2457
+ "donkey",
2458
+ "robe",
2459
+ "modify",
2460
+ "cookie",
2461
+ "napkin",
2462
+ "hash",
2463
+ "non",
2464
+ "roast",
2465
+ "tantrum",
2466
+ "pillow",
2467
+ "whine",
2468
+ "freak",
2469
+ "slave",
2470
+ "solomon",
2471
+ "plantation",
2472
+ "northup",
2473
+ "flint",
2474
+ "bane",
2475
+ "batman",
2476
+ "footpath",
2477
+ "kitchena",
2478
+ "laboratory",
2479
+ "gesticulate",
2480
+ "pinnochio",
2481
+ "cheerlead",
2482
+ "self",
2483
+ "cement",
2484
+ "vegetation",
2485
+ "mossy",
2486
+ "sermon",
2487
+ "fear",
2488
+ "pastor",
2489
+ "preach",
2490
+ "notice",
2491
+ "row",
2492
+ "evaluate",
2493
+ "kayak",
2494
+ "competetion",
2495
+ "kaya",
2496
+ "panda",
2497
+ "tend",
2498
+ "inappropriate",
2499
+ "log",
2500
+ "basement",
2501
+ "ladder",
2502
+ "trap",
2503
+ "latter",
2504
+ "motorcyle",
2505
+ "slab",
2506
+ "reallity",
2507
+ "creed",
2508
+ "costume",
2509
+ "handball",
2510
+ "goalie",
2511
+ "colbert",
2512
+ "vacuum",
2513
+ "sort",
2514
+ "launch",
2515
+ "popcorn",
2516
+ "episode",
2517
+ "wop",
2518
+ "wheelchair",
2519
+ "oliver",
2520
+ "steven",
2521
+ "enduro",
2522
+ "unload",
2523
+ "hardcore",
2524
+ "faster",
2525
+ "perfect",
2526
+ "helmet",
2527
+ "skiin",
2528
+ "dirtbike",
2529
+ "admire",
2530
+ "vechile",
2531
+ "expo",
2532
+ "bulb",
2533
+ "holiday",
2534
+ "palace",
2535
+ "crying",
2536
+ "cinderella",
2537
+ "soundtrack",
2538
+ "kind",
2539
+ "godmother",
2540
+ "hitter",
2541
+ "update",
2542
+ "congressman",
2543
+ "angela",
2544
+ "countdown",
2545
+ "leonardo",
2546
+ "bunch",
2547
+ "path",
2548
+ "disease",
2549
+ "battery",
2550
+ "attachment",
2551
+ "iron",
2552
+ "hulk",
2553
+ "drone",
2554
+ "lining",
2555
+ "aircraft",
2556
+ "snack",
2557
+ "household",
2558
+ "establishment",
2559
+ "criticise",
2560
+ "anger",
2561
+ "sunami",
2562
+ "shore",
2563
+ "commissioner",
2564
+ "disagree",
2565
+ "claim",
2566
+ "bia",
2567
+ "republican",
2568
+ "author",
2569
+ "cigarette",
2570
+ "brad",
2571
+ "poster",
2572
+ "escalate",
2573
+ "barn",
2574
+ "crane",
2575
+ "concentration",
2576
+ "ecavator",
2577
+ "selena",
2578
+ "hip",
2579
+ "outfit",
2580
+ "dancehall",
2581
+ "kitten",
2582
+ "diffrerent",
2583
+ "gossip",
2584
+ "taylor",
2585
+ "connect",
2586
+ "concept",
2587
+ "datum",
2588
+ "marker",
2589
+ "rubic",
2590
+ "medieval",
2591
+ "elf",
2592
+ "rocket",
2593
+ "infront",
2594
+ "want",
2595
+ "charlie",
2596
+ "restroom",
2597
+ "bladder",
2598
+ "scatter",
2599
+ "touchdown",
2600
+ "coaster",
2601
+ "fan",
2602
+ "eminem",
2603
+ "sprinter",
2604
+ "tragedy",
2605
+ "awe",
2606
+ "stitch",
2607
+ "sew",
2608
+ "knit",
2609
+ "dock",
2610
+ "harbor",
2611
+ "golfing",
2612
+ "disconnect",
2613
+ "rewire",
2614
+ "cord",
2615
+ "connector",
2616
+ "bob",
2617
+ "bop",
2618
+ "spongue",
2619
+ "spacey",
2620
+ "nod",
2621
+ "pay",
2622
+ "john",
2623
+ "fox",
2624
+ "intolerance",
2625
+ "pile",
2626
+ "speaking",
2627
+ "pant",
2628
+ "yawn",
2629
+ "bark",
2630
+ "mock",
2631
+ "puppy",
2632
+ "argument",
2633
+ "cheetum",
2634
+ "cheetah",
2635
+ "cheetha",
2636
+ "taw",
2637
+ "gee",
2638
+ "duel",
2639
+ "celebraty",
2640
+ "stack",
2641
+ "laundry",
2642
+ "clothe",
2643
+ "sheet",
2644
+ "clothesline",
2645
+ "lil",
2646
+ "biz",
2647
+ "amn",
2648
+ "sample",
2649
+ "hanson",
2650
+ "slip",
2651
+ "deer",
2652
+ "foothold",
2653
+ "raindeer",
2654
+ "nfl",
2655
+ "cello",
2656
+ "living",
2657
+ "forklift",
2658
+ "somone",
2659
+ "clipart",
2660
+ "mobility",
2661
+ "tinker",
2662
+ "click",
2663
+ "weighter",
2664
+ "decoration",
2665
+ "bank",
2666
+ "musician",
2667
+ "recite",
2668
+ "soccerball",
2669
+ "hokey",
2670
+ "hokie",
2671
+ "understand",
2672
+ "eclipse",
2673
+ "moo",
2674
+ "baraka",
2675
+ "actin",
2676
+ "fluid",
2677
+ "takeout",
2678
+ "preparation",
2679
+ "bimbo",
2680
+ "charmander",
2681
+ "osbourne",
2682
+ "ozbourne",
2683
+ "catcher",
2684
+ "anything",
2685
+ "edm",
2686
+ "rop",
2687
+ "swing",
2688
+ "cradle",
2689
+ "flare",
2690
+ "tornado",
2691
+ "solar",
2692
+ "burst",
2693
+ "finish",
2694
+ "cursor",
2695
+ "screenshot",
2696
+ "seafood",
2697
+ "shell",
2698
+ "cleaver",
2699
+ "crap",
2700
+ "highlight",
2701
+ "knead",
2702
+ "masseuse",
2703
+ "palor",
2704
+ "inblue",
2705
+ "column",
2706
+ "miniature",
2707
+ "behavior",
2708
+ "pizza",
2709
+ "pope",
2710
+ "bles",
2711
+ "surge",
2712
+ "rhino",
2713
+ "hippo",
2714
+ "moped",
2715
+ "gasket",
2716
+ "chief",
2717
+ "thaw",
2718
+ "obscure",
2719
+ "port",
2720
+ "visibility",
2721
+ "nemesi",
2722
+ "musc",
2723
+ "backdrop",
2724
+ "bride",
2725
+ "heartbroken",
2726
+ "hakeem",
2727
+ "underwear",
2728
+ "doodlebug",
2729
+ "nolan",
2730
+ "castle",
2731
+ "dungeon",
2732
+ "glassware",
2733
+ "dispener",
2734
+ "avoid",
2735
+ "accessory",
2736
+ "clutch",
2737
+ "steering",
2738
+ "warm",
2739
+ "stretch",
2740
+ "guitor",
2741
+ "ultron",
2742
+ "backflip",
2743
+ "watering",
2744
+ "wise",
2745
+ "lawsuit",
2746
+ "demon",
2747
+ "overdose",
2748
+ "paranormal",
2749
+ "bondage",
2750
+ "veg",
2751
+ "plank",
2752
+ "loading",
2753
+ "buffer",
2754
+ "wildernes",
2755
+ "wwe",
2756
+ "mysterio",
2757
+ "tear",
2758
+ "wool",
2759
+ "fabric",
2760
+ "italy",
2761
+ "volkswagen",
2762
+ "wheel",
2763
+ "passenger",
2764
+ "funnel",
2765
+ "unpack",
2766
+ "carton",
2767
+ "oragami",
2768
+ "advance",
2769
+ "grandfather",
2770
+ "slap",
2771
+ "sir",
2772
+ "student",
2773
+ "classmate",
2774
+ "burger",
2775
+ "stew",
2776
+ "fellowman",
2777
+ "restraunt",
2778
+ "kimchi",
2779
+ "today",
2780
+ "ceremony",
2781
+ "attention",
2782
+ "vistor",
2783
+ "visitor",
2784
+ "mount",
2785
+ "dugout",
2786
+ "mound",
2787
+ "championship",
2788
+ "vedio",
2789
+ "jungle",
2790
+ "statement",
2791
+ "restaurent",
2792
+ "dez",
2793
+ "shield",
2794
+ "eyelid",
2795
+ "shane",
2796
+ "monologue",
2797
+ "oscar",
2798
+ "podcast",
2799
+ "obsitcle",
2800
+ "bin",
2801
+ "donation",
2802
+ "kindnes",
2803
+ "multus",
2804
+ "columbium",
2805
+ "spielberg",
2806
+ "kitty",
2807
+ "emission",
2808
+ "outlet",
2809
+ "coyote",
2810
+ "howl",
2811
+ "pray",
2812
+ "televisoin",
2813
+ "skit",
2814
+ "mifa",
2815
+ "countryside",
2816
+ "belt",
2817
+ "independence",
2818
+ "tuck",
2819
+ "blanket",
2820
+ "blancket",
2821
+ "champagne",
2822
+ "card",
2823
+ "complaining",
2824
+ "albinism",
2825
+ "discvery",
2826
+ "shooter",
2827
+ "sledge",
2828
+ "feminism",
2829
+ "navy",
2830
+ "leak",
2831
+ "sweat",
2832
+ "broccoili",
2833
+ "reciepus",
2834
+ "bakery",
2835
+ "nightime",
2836
+ "commit",
2837
+ "swingset",
2838
+ "suicide",
2839
+ "finishing",
2840
+ "smoking",
2841
+ "tutor",
2842
+ "trance",
2843
+ "grandma",
2844
+ "blast",
2845
+ "punk",
2846
+ "rehearse",
2847
+ "headrest",
2848
+ "showing",
2849
+ "cone",
2850
+ "symbol",
2851
+ "japan",
2852
+ "terrace",
2853
+ "wifi",
2854
+ "bathtub",
2855
+ "straw",
2856
+ "bathwater",
2857
+ "drinking",
2858
+ "suck",
2859
+ "freeman",
2860
+ "morgan",
2861
+ "stork",
2862
+ "poach",
2863
+ "outbreak",
2864
+ "dicing",
2865
+ "dice",
2866
+ "producer",
2867
+ "drum",
2868
+ "percussion",
2869
+ "narration",
2870
+ "inventory",
2871
+ "mushroom",
2872
+ "select",
2873
+ "peephole",
2874
+ "society",
2875
+ "healthcare",
2876
+ "sharpie",
2877
+ "notebook",
2878
+ "packet",
2879
+ "future",
2880
+ "munch",
2881
+ "llamma",
2882
+ "elaborate",
2883
+ "fixing",
2884
+ "description",
2885
+ "giraffe",
2886
+ "mickey",
2887
+ "mousekersize",
2888
+ "walt",
2889
+ "adult",
2890
+ "syringe",
2891
+ "dropper",
2892
+ "condensation",
2893
+ "leash",
2894
+ "dye",
2895
+ "wagon",
2896
+ "camp",
2897
+ "stump",
2898
+ "campsite",
2899
+ "upbeat",
2900
+ "everyone",
2901
+ "limp",
2902
+ "weapon",
2903
+ "idiot",
2904
+ "conductor",
2905
+ "beautician",
2906
+ "colmy",
2907
+ "simplify",
2908
+ "lapse",
2909
+ "trampoline",
2910
+ "jewelry",
2911
+ "muffler",
2912
+ "subframe",
2913
+ "curvy",
2914
+ "celebration",
2915
+ "anniversay",
2916
+ "mariah",
2917
+ "anniversary",
2918
+ "romance",
2919
+ "doorbell",
2920
+ "shrimp",
2921
+ "everest",
2922
+ "five",
2923
+ "seminar",
2924
+ "perry",
2925
+ "solor",
2926
+ "trade",
2927
+ "hogan",
2928
+ "pople",
2929
+ "demontrating",
2930
+ "envy",
2931
+ "groud",
2932
+ "valencium",
2933
+ "womean",
2934
+ "marinate",
2935
+ "comedian",
2936
+ "viral",
2937
+ "spounge",
2938
+ "manipulate",
2939
+ "knot",
2940
+ "helicoptor",
2941
+ "location",
2942
+ "decide",
2943
+ "mountaintop",
2944
+ "skyanother",
2945
+ "2013",
2946
+ "runnig",
2947
+ "winner",
2948
+ "gizmo",
2949
+ "kajak",
2950
+ "extraction",
2951
+ "technician",
2952
+ "tissue",
2953
+ "duggar",
2954
+ "stonehenge",
2955
+ "afar",
2956
+ "branch",
2957
+ "wine",
2958
+ "flub",
2959
+ "mean",
2960
+ "consist",
2961
+ "accapella",
2962
+ "thunder",
2963
+ "lick",
2964
+ "loosen",
2965
+ "blogger",
2966
+ "close",
2967
+ "baking",
2968
+ "curse",
2969
+ "brutality",
2970
+ "weird",
2971
+ "flail",
2972
+ "whip",
2973
+ "manchild",
2974
+ "plunge",
2975
+ "splash",
2976
+ "thrash",
2977
+ "recount",
2978
+ "cheat",
2979
+ "cupboard",
2980
+ "organizer",
2981
+ "refrigerator",
2982
+ "shelf",
2983
+ "accident",
2984
+ "tamato",
2985
+ "scout",
2986
+ "iraq",
2987
+ "surveliance",
2988
+ "engineer",
2989
+ "perfomer",
2990
+ "kenium",
2991
+ "grafitus",
2992
+ "rejuvenate",
2993
+ "moisturizer",
2994
+ "fake",
2995
+ "salada",
2996
+ "series",
2997
+ "tokyo",
2998
+ "weather",
2999
+ "leap",
3000
+ "variety",
3001
+ "watermelon",
3002
+ "pastum",
3003
+ "buffet",
3004
+ "chaser",
3005
+ "recreate",
3006
+ "tire",
3007
+ "bush",
3008
+ "bok",
3009
+ "tibet",
3010
+ "grassland",
3011
+ "beyonce",
3012
+ "laura",
3013
+ "cottage",
3014
+ "strain",
3015
+ "frown",
3016
+ "merry",
3017
+ "broom",
3018
+ "gameboy",
3019
+ "broaden",
3020
+ "content",
3021
+ "knowledge",
3022
+ "trust",
3023
+ "lunch",
3024
+ "atray",
3025
+ "buff",
3026
+ "hitch",
3027
+ "gameone",
3028
+ "daring",
3029
+ "rabbit",
3030
+ "keeper",
3031
+ "ina",
3032
+ "reaper",
3033
+ "killer",
3034
+ "specialist",
3035
+ "psychoanalyst",
3036
+ "counselor",
3037
+ "duet",
3038
+ "melody",
3039
+ "arthur",
3040
+ "emanate",
3041
+ "rpm",
3042
+ "firing",
3043
+ "explosion",
3044
+ "escape",
3045
+ "scenario",
3046
+ "entertainer",
3047
+ "spill",
3048
+ "carve",
3049
+ "casa",
3050
+ "gordan",
3051
+ "btalking",
3052
+ "spectacle",
3053
+ "corn",
3054
+ "mixie",
3055
+ "coloring",
3056
+ "intersection",
3057
+ "gla",
3058
+ "lime",
3059
+ "kapil",
3060
+ "stalk",
3061
+ "handler",
3062
+ "sack",
3063
+ "procol",
3064
+ "puma",
3065
+ "randolph",
3066
+ "peetum",
3067
+ "hunger",
3068
+ "mince",
3069
+ "accros",
3070
+ "therapist",
3071
+ "togethernes",
3072
+ "counciling",
3073
+ "lawyer",
3074
+ "beer",
3075
+ "beverage",
3076
+ "garb",
3077
+ "hoist",
3078
+ "autograph",
3079
+ "possession",
3080
+ "egypt",
3081
+ "engaging",
3082
+ "careear",
3083
+ "proone",
3084
+ "session",
3085
+ "edward",
3086
+ "convince",
3087
+ "pail",
3088
+ "conveyer",
3089
+ "wet",
3090
+ "babality",
3091
+ "defeat",
3092
+ "humiliate",
3093
+ "mortal",
3094
+ "cnbc",
3095
+ "leather",
3096
+ "scarf",
3097
+ "esteban",
3098
+ "ricky",
3099
+ "spmone",
3100
+ "highschool",
3101
+ "extention",
3102
+ "michael",
3103
+ "deandre",
3104
+ "chug",
3105
+ "climbinb",
3106
+ "rating",
3107
+ "tilt",
3108
+ "playstation",
3109
+ "hook",
3110
+ "viedo",
3111
+ "note",
3112
+ "string",
3113
+ "thread",
3114
+ "sense",
3115
+ "orbit",
3116
+ "peruse",
3117
+ "compost",
3118
+ "gardening",
3119
+ "robert",
3120
+ "controller",
3121
+ "theme",
3122
+ "price",
3123
+ "inspector",
3124
+ "friendship",
3125
+ "rolling",
3126
+ "spinning",
3127
+ "mcgeorge",
3128
+ "balcony",
3129
+ "tortoise",
3130
+ "popsicle",
3131
+ "squint",
3132
+ "alongwith",
3133
+ "primate",
3134
+ "expression",
3135
+ "macaque",
3136
+ "wok",
3137
+ "prawn",
3138
+ "resist",
3139
+ "rape",
3140
+ "scale",
3141
+ "gorden",
3142
+ "hiw",
3143
+ "miscarriage",
3144
+ "dealership",
3145
+ "sportsman",
3146
+ "spatula",
3147
+ "gopher",
3148
+ "beaver",
3149
+ "whisk",
3150
+ "moment",
3151
+ "wafer",
3152
+ "complex",
3153
+ "squish",
3154
+ "vollyball",
3155
+ "employee",
3156
+ "dispense",
3157
+ "worker",
3158
+ "icecream",
3159
+ "badmington",
3160
+ "shuttlecock",
3161
+ "tourney",
3162
+ "mistake",
3163
+ "goald",
3164
+ "proficiency",
3165
+ "skip",
3166
+ "ariel",
3167
+ "cupcake",
3168
+ "mermaid",
3169
+ "zipper",
3170
+ "fantasy",
3171
+ "devastate",
3172
+ "compute",
3173
+ "brochure",
3174
+ "booklet",
3175
+ "lobby",
3176
+ "split",
3177
+ "sunda",
3178
+ "terrain",
3179
+ "skiing",
3180
+ "britain",
3181
+ "popularity",
3182
+ "coerce",
3183
+ "coo",
3184
+ "mama",
3185
+ "badmitton",
3186
+ "buttock",
3187
+ "chrissy",
3188
+ "verentum",
3189
+ "railing",
3190
+ "physician",
3191
+ "injection",
3192
+ "inject",
3193
+ "beg",
3194
+ "cock",
3195
+ "victorium",
3196
+ "manufacture",
3197
+ "marburg",
3198
+ "overlaid",
3199
+ "warning",
3200
+ "arch",
3201
+ "entrance",
3202
+ "torque",
3203
+ "faster0",
3204
+ "playlist",
3205
+ "online",
3206
+ "opinion",
3207
+ "marketplace",
3208
+ "goalpost",
3209
+ "proposition",
3210
+ "prostitute",
3211
+ "cctv",
3212
+ "situation",
3213
+ "encounter",
3214
+ "chrocodile",
3215
+ "tempt",
3216
+ "transmission",
3217
+ "temple",
3218
+ "architecture",
3219
+ "travelogue",
3220
+ "demonstrating",
3221
+ "tech",
3222
+ "tes",
3223
+ "starburst",
3224
+ "thailand",
3225
+ "marijuana",
3226
+ "witha",
3227
+ "foreigner",
3228
+ "forehead",
3229
+ "lannister",
3230
+ "dinklage",
3231
+ "lad",
3232
+ "constume",
3233
+ "katty",
3234
+ "katy",
3235
+ "chopter",
3236
+ "hoverbike",
3237
+ "hover",
3238
+ "generation",
3239
+ "apron",
3240
+ "axe",
3241
+ "injury",
3242
+ "safty",
3243
+ "wound",
3244
+ "grande",
3245
+ "ariana",
3246
+ "jason",
3247
+ "honey",
3248
+ "cotton",
3249
+ "stopper",
3250
+ "linu",
3251
+ "intro",
3252
+ "indie",
3253
+ "limit",
3254
+ "personality",
3255
+ "plasticine",
3256
+ "seek",
3257
+ "armor",
3258
+ "controversry",
3259
+ "straddle",
3260
+ "fingernail",
3261
+ "sautee",
3262
+ "patty",
3263
+ "caharacter",
3264
+ "redhead",
3265
+ "alphabet",
3266
+ "christian",
3267
+ "plug",
3268
+ "richkidstv",
3269
+ "resturant",
3270
+ "sweep",
3271
+ "unroll",
3272
+ "dust",
3273
+ "circuit",
3274
+ "jail",
3275
+ "agent",
3276
+ "magician",
3277
+ "interrogate",
3278
+ "tortilla",
3279
+ "hymn",
3280
+ "religion",
3281
+ "miley",
3282
+ "fallon",
3283
+ "jimmy",
3284
+ "decision",
3285
+ "jim",
3286
+ "snoop",
3287
+ "escort",
3288
+ "motocros",
3289
+ "info",
3290
+ "mas",
3291
+ "extract",
3292
+ "multimedium",
3293
+ "scorpion",
3294
+ "furnace",
3295
+ "incinerator",
3296
+ "jackson",
3297
+ "weekend",
3298
+ "cristiano",
3299
+ "suspense",
3300
+ "transform",
3301
+ "vilon",
3302
+ "tamil",
3303
+ "smock",
3304
+ "combine",
3305
+ "ginger",
3306
+ "dhal",
3307
+ "listing",
3308
+ "tomato",
3309
+ "mation",
3310
+ "candlelight",
3311
+ "defend",
3312
+ "doomsday",
3313
+ "malibu",
3314
+ "defending",
3315
+ "power",
3316
+ "mmorpg",
3317
+ "backyard",
3318
+ "illnes",
3319
+ "condition",
3320
+ "papaer",
3321
+ "unwrap",
3322
+ "gameboard",
3323
+ "nutraloaf",
3324
+ "prison",
3325
+ "ease",
3326
+ "feminine",
3327
+ "moar",
3328
+ "solicit",
3329
+ "minion",
3330
+ "auto",
3331
+ "offroad",
3332
+ "dune",
3333
+ "tumbler",
3334
+ "winan",
3335
+ "freezer",
3336
+ "storm",
3337
+ "bbq",
3338
+ "club",
3339
+ "retaurant",
3340
+ "parking",
3341
+ "greg",
3342
+ "paino",
3343
+ "rhyme",
3344
+ "nintendo",
3345
+ "coconut",
3346
+ "catapult",
3347
+ "manufacturing",
3348
+ "warehouse",
3349
+ "pursue",
3350
+ "reassure",
3351
+ "plan",
3352
+ "blouse",
3353
+ "bathrom",
3354
+ "interviewing",
3355
+ "jon",
3356
+ "condolence",
3357
+ "memorial",
3358
+ "homage",
3359
+ "audiomicro",
3360
+ "sugar",
3361
+ "cinnamon",
3362
+ "sponebob",
3363
+ "imitate",
3364
+ "recreation",
3365
+ "pressconference",
3366
+ "antler",
3367
+ "moose",
3368
+ "murry",
3369
+ "headset",
3370
+ "painting",
3371
+ "trench",
3372
+ "globemaster",
3373
+ "stoner",
3374
+ "skyscraper",
3375
+ "cabin",
3376
+ "orient",
3377
+ "stiller",
3378
+ "highdive",
3379
+ "statuesque",
3380
+ "elephant",
3381
+ "bump",
3382
+ "suscribe",
3383
+ "appearance",
3384
+ "limousine",
3385
+ "hovercopter",
3386
+ "hovercraft",
3387
+ "voyage",
3388
+ "jumbping",
3389
+ "surgery",
3390
+ "administer",
3391
+ "nickelodeon",
3392
+ "clicker",
3393
+ "brunnette",
3394
+ "mark",
3395
+ "loarge",
3396
+ "unison",
3397
+ "reverse",
3398
+ "cub",
3399
+ "cartwheel",
3400
+ "saute",
3401
+ "tyga",
3402
+ "heavyset",
3403
+ "puddle",
3404
+ "district",
3405
+ "weigh",
3406
+ "advertising",
3407
+ "promotion",
3408
+ "couh",
3409
+ "charleston",
3410
+ "nidoran",
3411
+ "pokelon",
3412
+ "blend",
3413
+ "drill",
3414
+ "pulp",
3415
+ "watermellon",
3416
+ "vie",
3417
+ "sportscaster",
3418
+ "sunset",
3419
+ "pork",
3420
+ "fair",
3421
+ "tin",
3422
+ "xbox",
3423
+ "jumper",
3424
+ "gol",
3425
+ "spiderman",
3426
+ "batmobile",
3427
+ "coacher",
3428
+ "gymnist",
3429
+ "fastball",
3430
+ "kukumapoo",
3431
+ "saffron",
3432
+ "audio",
3433
+ "idea",
3434
+ "thi",
3435
+ "reason",
3436
+ "isn",
3437
+ "introductory",
3438
+ "planist",
3439
+ "pianist",
3440
+ "anticipation",
3441
+ "legislation",
3442
+ "marijuan",
3443
+ "legalize",
3444
+ "hockey",
3445
+ "morgue",
3446
+ "case",
3447
+ "edge",
3448
+ "kombat",
3449
+ "spew",
3450
+ "workout",
3451
+ "playzone",
3452
+ "wtwo",
3453
+ "flatbread",
3454
+ "vendor",
3455
+ "naan",
3456
+ "pound",
3457
+ "sunlight",
3458
+ "traveller",
3459
+ "capacity",
3460
+ "picturesque",
3461
+ "ropway",
3462
+ "chevy",
3463
+ "balloon",
3464
+ "jet",
3465
+ "dub",
3466
+ "role",
3467
+ "indiana",
3468
+ "quote",
3469
+ "fedora",
3470
+ "paragraph",
3471
+ "trudge",
3472
+ "curtain",
3473
+ "nalepianki",
3474
+ "skrobanki",
3475
+ "desolate",
3476
+ "polo",
3477
+ "badge",
3478
+ "crack",
3479
+ "ineterview",
3480
+ "sneeze",
3481
+ "strobe",
3482
+ "vodka",
3483
+ "burray",
3484
+ "biotechnology",
3485
+ "heal",
3486
+ "criminal",
3487
+ "reenact",
3488
+ "void",
3489
+ "manage",
3490
+ "frog",
3491
+ "kermit",
3492
+ "mazda",
3493
+ "vortex",
3494
+ "liquicd",
3495
+ "elect",
3496
+ "winnie",
3497
+ "writhe",
3498
+ "fact",
3499
+ "tehcno",
3500
+ "fanning",
3501
+ "commentating",
3502
+ "mole",
3503
+ "accept",
3504
+ "loom",
3505
+ "twist",
3506
+ "cadaver",
3507
+ "commuter",
3508
+ "sassy",
3509
+ "makup",
3510
+ "cyber",
3511
+ "restaurand",
3512
+ "doe",
3513
+ "violence",
3514
+ "plumber",
3515
+ "chararacter",
3516
+ "intelligence",
3517
+ "algorithm",
3518
+ "gesture",
3519
+ "sugarflor",
3520
+ "hurdle",
3521
+ "copetision",
3522
+ "iceland",
3523
+ "paraglide",
3524
+ "mustang",
3525
+ "nicki",
3526
+ "nikki",
3527
+ "compliment",
3528
+ "attraction",
3529
+ "pesron",
3530
+ "core",
3531
+ "ghul",
3532
+ "stab",
3533
+ "smith",
3534
+ "moma",
3535
+ "zoo",
3536
+ "calony",
3537
+ "survival",
3538
+ "bee",
3539
+ "minature",
3540
+ "saucer",
3541
+ "iteam",
3542
+ "reviewer",
3543
+ "delevigne",
3544
+ "mountion",
3545
+ "sum",
3546
+ "scooba",
3547
+ "official",
3548
+ "tur",
3549
+ "slash",
3550
+ "racoon",
3551
+ "photgraph",
3552
+ "magnet",
3553
+ "megaphone",
3554
+ "alpasca",
3555
+ "animate",
3556
+ "excitement",
3557
+ "hayatochirus",
3558
+ "shopit",
3559
+ "voilin",
3560
+ "homerun",
3561
+ "puppet",
3562
+ "daycare",
3563
+ "roofer",
3564
+ "adhesive",
3565
+ "behave",
3566
+ "liner",
3567
+ "curly",
3568
+ "headphone",
3569
+ "seduce",
3570
+ "passion",
3571
+ "saree",
3572
+ "mourn",
3573
+ "dismis",
3574
+ "dialysi",
3575
+ "tender",
3576
+ "salmon",
3577
+ "wink",
3578
+ "assessment",
3579
+ "finland",
3580
+ "passi",
3581
+ "avenger",
3582
+ "uniform",
3583
+ "law",
3584
+ "tense",
3585
+ "thrill",
3586
+ "franchise",
3587
+ "laughter",
3588
+ "standup",
3589
+ "paneer",
3590
+ "archor",
3591
+ "newtown",
3592
+ "megan",
3593
+ "faucet",
3594
+ "cus",
3595
+ "eyeshadow",
3596
+ "dragon",
3597
+ "bet",
3598
+ "pattern",
3599
+ "nerf",
3600
+ "fondle",
3601
+ "reckles",
3602
+ "sportscar",
3603
+ "public",
3604
+ "midair",
3605
+ "norton",
3606
+ "soak",
3607
+ "squeeze",
3608
+ "sandra",
3609
+ "crisi",
3610
+ "smashbrother",
3611
+ "supersmash",
3612
+ "wild",
3613
+ "encouragement",
3614
+ "barefoot",
3615
+ "ammunition",
3616
+ "ammo",
3617
+ "workspace",
3618
+ "coma",
3619
+ "scar",
3620
+ "expectancy",
3621
+ "rockband",
3622
+ "sceneary",
3623
+ "glare",
3624
+ "invention",
3625
+ "wardrobe",
3626
+ "antique",
3627
+ "volkswagon",
3628
+ "pair",
3629
+ "huddle",
3630
+ "cold",
3631
+ "putty",
3632
+ "samsung",
3633
+ "muscle",
3634
+ "daytona",
3635
+ "datona",
3636
+ "pocket",
3637
+ "wordgirl",
3638
+ "artillery",
3639
+ "exist",
3640
+ "breathtaking",
3641
+ "cannonball",
3642
+ "beanbag",
3643
+ "pinatum",
3644
+ "boosie",
3645
+ "valence",
3646
+ "nucleu",
3647
+ "electron",
3648
+ "argon",
3649
+ "clarinet",
3650
+ "shiver",
3651
+ "mattres",
3652
+ "moschimo",
3653
+ "togetum",
3654
+ "misty",
3655
+ "impres",
3656
+ "manican",
3657
+ "anaconda",
3658
+ "hoax",
3659
+ "boa",
3660
+ "scorpio",
3661
+ "wheeler",
3662
+ "career",
3663
+ "aldy",
3664
+ "charlotte",
3665
+ "thumb",
3666
+ "trinity",
3667
+ "wag",
3668
+ "shelter",
3669
+ "register",
3670
+ "trapeze",
3671
+ "stopwatch",
3672
+ "trapeeze",
3673
+ "spot",
3674
+ "strike",
3675
+ "maker",
3676
+ "smooth",
3677
+ "hay",
3678
+ "stall",
3679
+ "bang",
3680
+ "emotion",
3681
+ "dragnet",
3682
+ "appoint",
3683
+ "crib",
3684
+ "penguin",
3685
+ "cadillac",
3686
+ "sneak",
3687
+ "classrooom",
3688
+ "pause",
3689
+ "cane",
3690
+ "bbqing",
3691
+ "digging",
3692
+ "com",
3693
+ "graze",
3694
+ "console",
3695
+ "cucumber",
3696
+ "editing",
3697
+ "illustration",
3698
+ "snakeskin",
3699
+ "handmade",
3700
+ "amongst",
3701
+ "locality",
3702
+ "choke",
3703
+ "conception",
3704
+ "custom",
3705
+ "pewd",
3706
+ "pewdiepie",
3707
+ "muck",
3708
+ "monica",
3709
+ "undertake",
3710
+ "voltage",
3711
+ "dehydration",
3712
+ "sponsor",
3713
+ "indicate",
3714
+ "surfing",
3715
+ "cargo",
3716
+ "pimp",
3717
+ "barbeque",
3718
+ "hearthstone",
3719
+ "cherokee",
3720
+ "minnie",
3721
+ "fluffy",
3722
+ "sportswoman",
3723
+ "soul",
3724
+ "curl",
3725
+ "tag",
3726
+ "lure",
3727
+ "gilr",
3728
+ "brazier",
3729
+ "bra",
3730
+ "judgement",
3731
+ "somemovie",
3732
+ "terminator",
3733
+ "extend",
3734
+ "washing",
3735
+ "blank",
3736
+ "athelete",
3737
+ "marshawn",
3738
+ "shove",
3739
+ "fighting",
3740
+ "seal",
3741
+ "sight",
3742
+ "developmet",
3743
+ "ipad",
3744
+ "jerry",
3745
+ "investment",
3746
+ "whitney",
3747
+ "madison",
3748
+ "climate",
3749
+ "aviation",
3750
+ "froot",
3751
+ "hallelujah",
3752
+ "anchorman",
3753
+ "inspection",
3754
+ "switch",
3755
+ "microscope",
3756
+ "hocky",
3757
+ "monopoly",
3758
+ "electricity",
3759
+ "spark",
3760
+ "placement",
3761
+ "sleeping",
3762
+ "rubick",
3763
+ "tattoo",
3764
+ "canva",
3765
+ "snatch",
3766
+ "pitt",
3767
+ "daydream",
3768
+ "disturbing",
3769
+ "hypnotist",
3770
+ "wan",
3771
+ "commander",
3772
+ "jawsome",
3773
+ "forza",
3774
+ "lui",
3775
+ "prey",
3776
+ "burrow",
3777
+ "backside",
3778
+ "saw",
3779
+ "account",
3780
+ "trash",
3781
+ "freeform",
3782
+ "poker",
3783
+ "ochocinco",
3784
+ "espn",
3785
+ "pronounce",
3786
+ "join",
3787
+ "territory",
3788
+ "stunt",
3789
+ "mikey",
3790
+ "tug",
3791
+ "tugowar",
3792
+ "bloom",
3793
+ "breeze",
3794
+ "champaka",
3795
+ "greenery",
3796
+ "cycyle",
3797
+ "goliath",
3798
+ "spit",
3799
+ "cermony",
3800
+ "bestow",
3801
+ "meditate",
3802
+ "alloy",
3803
+ "rim",
3804
+ "wonder",
3805
+ "cihldren",
3806
+ "firehouse",
3807
+ "croquette",
3808
+ "skirt",
3809
+ "sheff",
3810
+ "dungenes",
3811
+ "calico",
3812
+ "mistres",
3813
+ "secssion",
3814
+ "dicaprio",
3815
+ "dhoni",
3816
+ "toyotum",
3817
+ "hope",
3818
+ "unrest",
3819
+ "boyband",
3820
+ "lock",
3821
+ "executioner",
3822
+ "bystander",
3823
+ "execution",
3824
+ "south",
3825
+ "robin",
3826
+ "defense",
3827
+ "protective",
3828
+ "mnay",
3829
+ "chomp",
3830
+ "anatomy",
3831
+ "dissect",
3832
+ "exhibition",
3833
+ "jeremy",
3834
+ "cocktail",
3835
+ "downey",
3836
+ "adjustment",
3837
+ "kickball",
3838
+ "knuckleball",
3839
+ "advise",
3840
+ "server",
3841
+ "entree",
3842
+ "sirector",
3843
+ "amazing",
3844
+ "bonu",
3845
+ "diddy",
3846
+ "tabletop",
3847
+ "sopmeone",
3848
+ "wax",
3849
+ "removal",
3850
+ "forearm",
3851
+ "walkthrough",
3852
+ "biking",
3853
+ "vitamin",
3854
+ "choclate",
3855
+ "refridegerator",
3856
+ "protein",
3857
+ "islamophobium",
3858
+ "pact",
3859
+ "gabriel",
3860
+ "pongal",
3861
+ "downloading",
3862
+ "over-emphasizing",
3863
+ "harmony",
3864
+ "housing",
3865
+ "chubby",
3866
+ "calmly",
3867
+ "oxygen",
3868
+ "underwater",
3869
+ "gorge",
3870
+ "thick",
3871
+ "diner",
3872
+ "suitcase",
3873
+ "breifcase",
3874
+ "submarine",
3875
+ "pressure",
3876
+ "yoga",
3877
+ "carter",
3878
+ "flap",
3879
+ "parkor",
3880
+ "spare",
3881
+ "bowling",
3882
+ "bowler",
3883
+ "comdey",
3884
+ "slopestyle",
3885
+ "section",
3886
+ "promo",
3887
+ "mathew",
3888
+ "spacecraft",
3889
+ "mcconaughy",
3890
+ "bonfire",
3891
+ "bonefire",
3892
+ "brake",
3893
+ "carbreak",
3894
+ "rotor",
3895
+ "recipie",
3896
+ "stimulate",
3897
+ "europe",
3898
+ "groundwater",
3899
+ "outer",
3900
+ "seep",
3901
+ "widow",
3902
+ "johansson",
3903
+ "ten",
3904
+ "vessel",
3905
+ "germany",
3906
+ "labor",
3907
+ "gauge",
3908
+ "garnish",
3909
+ "biker",
3910
+ "pore",
3911
+ "recepie",
3912
+ "mspaint",
3913
+ "furry",
3914
+ "inch",
3915
+ "worm",
3916
+ "caterpillar",
3917
+ "wate",
3918
+ "hula",
3919
+ "footnall",
3920
+ "compose",
3921
+ "saddle",
3922
+ "trot",
3923
+ "backboard",
3924
+ "dwight",
3925
+ "bus",
3926
+ "neighborhood",
3927
+ "schoolbu",
3928
+ "prince",
3929
+ "bambi",
3930
+ "banana",
3931
+ "ulton",
3932
+ "crossbar",
3933
+ "typewriter",
3934
+ "trio",
3935
+ "ufc",
3936
+ "frypan",
3937
+ "consumer",
3938
+ "snuggle",
3939
+ "industrialist",
3940
+ "cramer",
3941
+ "mpv",
3942
+ "spokesman",
3943
+ "toast",
3944
+ "motherboard",
3945
+ "surprise",
3946
+ "improvisation",
3947
+ "perosn",
3948
+ "acrobat",
3949
+ "lobster",
3950
+ "stain",
3951
+ "beige",
3952
+ "year",
3953
+ "blackberry",
3954
+ "storyline",
3955
+ "walking",
3956
+ "beeker",
3957
+ "beaker",
3958
+ "solution",
3959
+ "derrick",
3960
+ "unicycle",
3961
+ "loiter",
3962
+ "bose",
3963
+ "funny",
3964
+ "desktop",
3965
+ "mac",
3966
+ "crossing",
3967
+ "recording",
3968
+ "apology",
3969
+ "bunker",
3970
+ "argentina",
3971
+ "paw",
3972
+ "displaying",
3973
+ "batterie",
3974
+ "propeller",
3975
+ "clockwise",
3976
+ "costar",
3977
+ "decease",
3978
+ "lsd",
3979
+ "mating",
3980
+ "hump",
3981
+ "copulate",
3982
+ "doom",
3983
+ "chemical",
3984
+ "whelm",
3985
+ "proposal",
3986
+ "tabletenni",
3987
+ "breath",
3988
+ "hemsworth",
3989
+ "acting",
3990
+ "helmsworth",
3991
+ "twin",
3992
+ "pertain",
3993
+ "intensity",
3994
+ "fork",
3995
+ "cower",
3996
+ "insect",
3997
+ "nerd",
3998
+ "disappointment",
3999
+ "zip",
4000
+ "network",
4001
+ "collaboration",
4002
+ "anyone",
4003
+ "communication",
4004
+ "hai",
4005
+ "aisain",
4006
+ "raid",
4007
+ "dustbin",
4008
+ "dumpster",
4009
+ "mailbox",
4010
+ "aerobic",
4011
+ "canyon",
4012
+ "cody",
4013
+ "analyze",
4014
+ "vapor",
4015
+ "roacket",
4016
+ "micky",
4017
+ "fairie",
4018
+ "lamb",
4019
+ "nursery",
4020
+ "initiative",
4021
+ "precision",
4022
+ "photoshoot",
4023
+ "genital",
4024
+ "narrarator",
4025
+ "sygornie",
4026
+ "sigourney",
4027
+ "re-enacting",
4028
+ "pikini",
4029
+ "beggar",
4030
+ "lend",
4031
+ "throne",
4032
+ "monarchy",
4033
+ "grace",
4034
+ "daisy",
4035
+ "twain",
4036
+ "produce",
4037
+ "peal",
4038
+ "moto",
4039
+ "wreck",
4040
+ "mcdonald",
4041
+ "roaed",
4042
+ "darken",
4043
+ "dressing",
4044
+ "reno",
4045
+ "len",
4046
+ "groin",
4047
+ "eavesdrop",
4048
+ "eavesdropping",
4049
+ "distraught",
4050
+ "dispenser",
4051
+ "exhilarate",
4052
+ "exhilarating",
4053
+ "designate",
4054
+ "programming",
4055
+ "globalization",
4056
+ "astage",
4057
+ "lip",
4058
+ "foodball",
4059
+ "phrenology",
4060
+ "phremology",
4061
+ "america",
4062
+ "hitler",
4063
+ "discrimination",
4064
+ "racism",
4065
+ "houser",
4066
+ "pragmatism",
4067
+ "swimmer",
4068
+ "hank",
4069
+ "medication",
4070
+ "prescription",
4071
+ "transport",
4072
+ "adulthood",
4073
+ "nightclub",
4074
+ "wig",
4075
+ "overlay",
4076
+ "meter",
4077
+ "800",
4078
+ "footrace",
4079
+ "siphon",
4080
+ "firmware",
4081
+ "seagull",
4082
+ "importance",
4083
+ "accompany",
4084
+ "eight",
4085
+ "husband",
4086
+ "replacement",
4087
+ "machince",
4088
+ "uner",
4089
+ "bronze",
4090
+ "trim",
4091
+ "laser",
4092
+ "restarunt",
4093
+ "stucture",
4094
+ "surgeon",
4095
+ "nanny",
4096
+ "misbehave",
4097
+ "floating",
4098
+ "mule",
4099
+ "hamburger",
4100
+ "iowa",
4101
+ "lowa",
4102
+ "brief",
4103
+ "cleanong",
4104
+ "solo",
4105
+ "tar",
4106
+ "homemade",
4107
+ "winston",
4108
+ "peole",
4109
+ "agreement",
4110
+ "unemployment",
4111
+ "baton",
4112
+ "woody",
4113
+ "costumer",
4114
+ "literature",
4115
+ "campaigning",
4116
+ "presidency",
4117
+ "refuse",
4118
+ "courier",
4119
+ "beat-boxing",
4120
+ "fillet",
4121
+ "relay",
4122
+ "ping",
4123
+ "request",
4124
+ "kircher",
4125
+ "tim",
4126
+ "convulse",
4127
+ "assail",
4128
+ "intrude",
4129
+ "gourmet",
4130
+ "gingerbread",
4131
+ "recode",
4132
+ "foreground",
4133
+ "ditch",
4134
+ "playdoh",
4135
+ "dexter",
4136
+ "char",
4137
+ "daryl",
4138
+ "showcase",
4139
+ "mand",
4140
+ "abot",
4141
+ "cheery",
4142
+ "sunstance",
4143
+ "album",
4144
+ "fireplace",
4145
+ "resistance",
4146
+ "bga",
4147
+ "larry",
4148
+ "tune",
4149
+ "geographic",
4150
+ "spiel",
4151
+ "alternator",
4152
+ "installing",
4153
+ "brag",
4154
+ "berate",
4155
+ "womanb",
4156
+ "bumper",
4157
+ "dent",
4158
+ "prus",
4159
+ "daylight",
4160
+ "natuiralist",
4161
+ "halloween",
4162
+ "sirloin",
4163
+ "pinocchio",
4164
+ "bussaly",
4165
+ "contmy",
4166
+ "foosball",
4167
+ "cocoon",
4168
+ "burqa",
4169
+ "impersonate",
4170
+ "fingure",
4171
+ "primer",
4172
+ "carson",
4173
+ "feeding",
4174
+ "parent"
4175
+ ]
ChatUniVi/eval/questions/video_qa/msrvtt_qa.json ADDED
The diff for this file is too large to render. See raw diff
 
ChatUniVi/eval/questions/video_qa/msvd_a_list.json ADDED
@@ -0,0 +1,1202 @@
1
+ [
2
+ "someone",
3
+ "rifle",
4
+ "man",
5
+ "box",
6
+ "gun",
7
+ "person",
8
+ "weapon",
9
+ "contain",
10
+ "opine",
11
+ "show",
12
+ "sword",
13
+ "boy",
14
+ "kid",
15
+ "fight",
16
+ "twirl",
17
+ "put",
18
+ "demonstrate",
19
+ "dance",
20
+ "practice",
21
+ "two",
22
+ "gras",
23
+ "demonstration",
24
+ "karate",
25
+ "perform",
26
+ "path",
27
+ "wildernes",
28
+ "trail",
29
+ "road",
30
+ "style",
31
+ "lane",
32
+ "actor",
33
+ "walk",
34
+ "eagle",
35
+ "motion",
36
+ "hare",
37
+ "rabbit",
38
+ "bird",
39
+ "bunny",
40
+ "field",
41
+ "dodge",
42
+ "try",
43
+ "chase",
44
+ "water",
45
+ "cliff",
46
+ "river",
47
+ "ocean",
48
+ "rock",
49
+ "dive",
50
+ "jump",
51
+ "crust",
52
+ "dough",
53
+ "chef",
54
+ "cook",
55
+ "lay",
56
+ "spread",
57
+ "unroll",
58
+ "cut",
59
+ "roll",
60
+ "guitar",
61
+ "play",
62
+ "cat",
63
+ "dog",
64
+ "bite",
65
+ "break",
66
+ "burst",
67
+ "pop",
68
+ "attack",
69
+ "girl",
70
+ "ball",
71
+ "toddler",
72
+ "baby",
73
+ "yard",
74
+ "animal",
75
+ "throw",
76
+ "motorcycle",
77
+ "ground",
78
+ "bike",
79
+ "stunt",
80
+ "wheel",
81
+ "motorcyle",
82
+ "guy",
83
+ "motorbike",
84
+ "minibike",
85
+ "wheelie",
86
+ "dachshund",
87
+ "pool",
88
+ "swim",
89
+ "stage",
90
+ "concert",
91
+ "jeff",
92
+ "pyramid",
93
+ "school",
94
+ "form",
95
+ "kneel",
96
+ "climb",
97
+ "build",
98
+ "target",
99
+ "pistol",
100
+ "machine",
101
+ "glock",
102
+ "fire",
103
+ "shoot",
104
+ "stand",
105
+ "woman",
106
+ "knife",
107
+ "vegetable",
108
+ "lady",
109
+ "leafy",
110
+ "parsley",
111
+ "herb",
112
+ "coriander",
113
+ "chop",
114
+ "garlic",
115
+ "turnip",
116
+ "bowl",
117
+ "root",
118
+ "plant",
119
+ "pull",
120
+ "clean",
121
+ "mushroom",
122
+ "tapioca",
123
+ "cauliflower",
124
+ "food",
125
+ "dice",
126
+ "use",
127
+ "slouse",
128
+ "mall",
129
+ "jean",
130
+ "airport",
131
+ "onion",
132
+ "phone",
133
+ "telephone",
134
+ "hang",
135
+ "speak",
136
+ "talk",
137
+ "hold",
138
+ "hand",
139
+ "arm",
140
+ "star",
141
+ "shin",
142
+ "gaze",
143
+ "reach",
144
+ "stretch",
145
+ "watch",
146
+ "guinea",
147
+ "lettuce",
148
+ "vegetation",
149
+ "eat",
150
+ "tree",
151
+ "rain",
152
+ "polouse",
153
+ "vehicle",
154
+ "car",
155
+ "highway",
156
+ "maneuver",
157
+ "driver",
158
+ "cop",
159
+ "evade",
160
+ "force",
161
+ "pit",
162
+ "shrimp",
163
+ "shell",
164
+ "peel",
165
+ "prepare",
166
+ "prawn",
167
+ "fish",
168
+ "seafood",
169
+ "worm",
170
+ "tail",
171
+ "stuff",
172
+ "seasoning",
173
+ "starch",
174
+ "cover",
175
+ "season",
176
+ "leek",
177
+ "fruit",
178
+ "leak",
179
+ "somoene",
180
+ "batter",
181
+ "substance",
182
+ "flmy",
183
+ "flower",
184
+ "mix",
185
+ "pmy",
186
+ "brush",
187
+ "tempura",
188
+ "coat",
189
+ "spice",
190
+ "butter",
191
+ "fry",
192
+ "meat",
193
+ "dip",
194
+ "place",
195
+ "bread",
196
+ "oil",
197
+ "pot",
198
+ "something",
199
+ "mitsuba",
200
+ "frying",
201
+ "tray",
202
+ "grill",
203
+ "liquid",
204
+ "stir",
205
+ "cilantro",
206
+ "udon",
207
+ "pastum",
208
+ "boil",
209
+ "foodstuff",
210
+ "biscuit",
211
+ "sushi",
212
+ "wrapper",
213
+ "cheese",
214
+ "kamaboko",
215
+ "package",
216
+ "unwrap",
217
+ "wood",
218
+ "pink",
219
+ "cake",
220
+ "purple",
221
+ "kamakodo",
222
+ "lemon",
223
+ "rind",
224
+ "piece",
225
+ "skin",
226
+ "orange",
227
+ "container",
228
+ "hydrate",
229
+ "date",
230
+ "shiitake",
231
+ "fig",
232
+ "cros",
233
+ "somene",
234
+ "thing",
235
+ "score",
236
+ "broth",
237
+ "saute",
238
+ "drive",
239
+ "violin",
240
+ "microphone",
241
+ "instrument",
242
+ "fiddle",
243
+ "pluck",
244
+ "steak",
245
+ "pork",
246
+ "poke",
247
+ "pierce",
248
+ "pepper",
249
+ "sprinkle",
250
+ "plate",
251
+ "egg",
252
+ "dish",
253
+ "drop",
254
+ "street",
255
+ "motocycle",
256
+ "motor",
257
+ "lie",
258
+ "pitcher",
259
+ "soda",
260
+ "juic",
261
+ "juice",
262
+ "mixture",
263
+ "jug",
264
+ "jar",
265
+ "cucumber",
266
+ "glas",
267
+ "drinking",
268
+ "drink",
269
+ "four",
270
+ "explosion",
271
+ "blast",
272
+ "blow",
273
+ "burn",
274
+ "couch",
275
+ "sit",
276
+ "sleeping",
277
+ "groom",
278
+ "lick",
279
+ "superman",
280
+ "superhero",
281
+ "door",
282
+ "superhuman",
283
+ "giant",
284
+ "character",
285
+ "move",
286
+ "push",
287
+ "lift",
288
+ "individual",
289
+ "dirt",
290
+ "biker",
291
+ "bump",
292
+ "air",
293
+ "cycle",
294
+ "fly",
295
+ "bed",
296
+ "nap",
297
+ "sleep",
298
+ "wake",
299
+ "waken",
300
+ "hammer",
301
+ "thumb",
302
+ "finger",
303
+ "hit",
304
+ "smash",
305
+ "rope",
306
+ "walkway",
307
+ "pathway",
308
+ "house",
309
+ "skip",
310
+ "jumproping",
311
+ "jockey",
312
+ "horse",
313
+ "perimeter",
314
+ "stadium",
315
+ "ring",
316
+ "arena",
317
+ "competition",
318
+ "wrestle",
319
+ "cockfight",
320
+ "trumpet",
321
+ "club",
322
+ "song",
323
+ "menacing",
324
+ "clown",
325
+ "band",
326
+ "menace",
327
+ "look",
328
+ "beef",
329
+ "fat",
330
+ "roast",
331
+ "trim",
332
+ "bowler",
333
+ "split",
334
+ "bowling",
335
+ "remain",
336
+ "board",
337
+ "flip",
338
+ "doctor",
339
+ "emergency",
340
+ "chest",
341
+ "patient",
342
+ "gurney",
343
+ "hospital",
344
+ "die",
345
+ "bleed",
346
+ "help",
347
+ "content",
348
+ "chili",
349
+ "tin",
350
+ "dump",
351
+ "mixer",
352
+ "blender",
353
+ "cream",
354
+ "whisk",
355
+ "knead",
356
+ "whip",
357
+ "chimpanzee",
358
+ "cigarette",
359
+ "smoking",
360
+ "monkey",
361
+ "smoke",
362
+ "room",
363
+ "christma",
364
+ "wear",
365
+ "turtle",
366
+ "tortoise",
367
+ "hunt",
368
+ "ice",
369
+ "skate",
370
+ "sandwich",
371
+ "mayonnaise",
372
+ "equipment",
373
+ "mechanic",
374
+ "speaker",
375
+ "loudspeaker",
376
+ "join",
377
+ "portion",
378
+ "outer",
379
+ "clove",
380
+ "peal",
381
+ "mince",
382
+ "capsicum",
383
+ "bell",
384
+ "center",
385
+ "whole",
386
+ "half",
387
+ "apple",
388
+ "wall",
389
+ "building",
390
+ "picture",
391
+ "camera",
392
+ "photograph",
393
+ "milk",
394
+ "sauce",
395
+ "football",
396
+ "soccer",
397
+ "otter",
398
+ "beaver",
399
+ "slide",
400
+ "mango",
401
+ "avocado",
402
+ "lengthwise",
403
+ "squash",
404
+ "chew",
405
+ "mes",
406
+ "tease",
407
+ "gnaw",
408
+ "sniff",
409
+ "grocery",
410
+ "stroller",
411
+ "pram",
412
+ "store",
413
+ "supermarket",
414
+ "cart",
415
+ "bay",
416
+ "pay",
417
+ "forest",
418
+ "deer",
419
+ "pas",
420
+ "roam",
421
+ "head",
422
+ "side",
423
+ "stove",
424
+ "skillet",
425
+ "sizzle",
426
+ "machinthi",
427
+ "kanu",
428
+ "hole",
429
+ "drill",
430
+ "block",
431
+ "rhino",
432
+ "rhinocero",
433
+ "grazing",
434
+ "graze",
435
+ "bush",
436
+ "hide",
437
+ "spy",
438
+ "face",
439
+ "halt",
440
+ "luggage",
441
+ "speed",
442
+ "papaya",
443
+ "carve",
444
+ "track",
445
+ "eight",
446
+ "race",
447
+ "compete",
448
+ "sprint",
449
+ "faucet",
450
+ "bath",
451
+ "sink",
452
+ "splash",
453
+ "shower",
454
+ "playing",
455
+ "trick",
456
+ "deck",
457
+ "card",
458
+ "display",
459
+ "marley",
460
+ "ride",
461
+ "orangatan",
462
+ "orangutan",
463
+ "shirt",
464
+ "strip",
465
+ "undres",
466
+ "banana",
467
+ "bottom",
468
+ "start",
469
+ "snowboarder",
470
+ "snow",
471
+ "hill",
472
+ "slope",
473
+ "mountain",
474
+ "skier",
475
+ "ski",
476
+ "snowboard",
477
+ "potato",
478
+ "potatoe",
479
+ "grate",
480
+ "foot",
481
+ "thumper",
482
+ "cartoon",
483
+ "love",
484
+ "kis",
485
+ "dres",
486
+ "costume",
487
+ "punch",
488
+ "shake",
489
+ "underwood",
490
+ "carrie",
491
+ "carpet",
492
+ "bug",
493
+ "floor",
494
+ "witha",
495
+ "paw",
496
+ "bark",
497
+ "child",
498
+ "right",
499
+ "sofa",
500
+ "crawl",
501
+ "infant",
502
+ "snuggle",
503
+ "nuzzle",
504
+ "rub",
505
+ "cares",
506
+ "kitten",
507
+ "shoestring",
508
+ "pet",
509
+ "touch",
510
+ "shih",
511
+ "settee",
512
+ "rib",
513
+ "spare",
514
+ "bone",
515
+ "rack",
516
+ "bbq",
517
+ "three",
518
+ "spar",
519
+ "helicopter",
520
+ "airplane",
521
+ "act",
522
+ "boxer",
523
+ "audience",
524
+ "routine",
525
+ "presentation",
526
+ "book",
527
+ "call",
528
+ "read",
529
+ "receive",
530
+ "attend",
531
+ "carrot",
532
+ "lori",
533
+ "sloth",
534
+ "lorus",
535
+ "paper",
536
+ "rectangular",
537
+ "tear",
538
+ "bicycle",
539
+ "primate",
540
+ "tricycle",
541
+ "coach",
542
+ "icing",
543
+ "cupcake",
544
+ "five",
545
+ "pom",
546
+ "butt",
547
+ "cheer",
548
+ "motorcyclist",
549
+ "rider",
550
+ "motorcylist",
551
+ "crash",
552
+ "fall",
553
+ "finishing",
554
+ "artist",
555
+ "pencil",
556
+ "sketch",
557
+ "draw",
558
+ "essence",
559
+ "cooking",
560
+ "corn",
561
+ "vanilla",
562
+ "drizzle",
563
+ "thick",
564
+ "loaf",
565
+ "brown",
566
+ "swat",
567
+ "beat",
568
+ "wave",
569
+ "shotgun",
570
+ "running",
571
+ "spoon",
572
+ "table",
573
+ "meal",
574
+ "din",
575
+ "curve",
576
+ "course",
577
+ "motocros",
578
+ "lake",
579
+ "swimming",
580
+ "line",
581
+ "shallot",
582
+ "foam",
583
+ "bathing",
584
+ "bettie",
585
+ "edge",
586
+ "ledge",
587
+ "singing",
588
+ "makeup",
589
+ "eyelid",
590
+ "shadow",
591
+ "liner",
592
+ "indoor",
593
+ "gear",
594
+ "scale",
595
+ "music",
596
+ "group",
597
+ "stamp",
598
+ "tomato",
599
+ "tomatoe",
600
+ "mozzarella",
601
+ "rice",
602
+ "scoop",
603
+ "lasagna",
604
+ "colander",
605
+ "simmer",
606
+ "wrestler",
607
+ "opponent",
608
+ "fighter",
609
+ "reporter",
610
+ "sandcastle",
611
+ "sand",
612
+ "castle",
613
+ "interview",
614
+ "bar",
615
+ "fence",
616
+ "horseback",
617
+ "obstacle",
618
+ "while",
619
+ "beach",
620
+ "singer",
621
+ "hitchhiking",
622
+ "hitchhiker",
623
+ "hitch",
624
+ "ask",
625
+ "hitchhike",
626
+ "eyeliner",
627
+ "outline",
628
+ "goat",
629
+ "bleat",
630
+ "tiger",
631
+ "lion",
632
+ "albino",
633
+ "cub",
634
+ "interact",
635
+ "carriage",
636
+ "family",
637
+ "trot",
638
+ "transport",
639
+ "travel",
640
+ "row",
641
+ "hedge",
642
+ "lemur",
643
+ "furry",
644
+ "weird",
645
+ "hug",
646
+ "gather",
647
+ "huddle",
648
+ "panda",
649
+ "keeper",
650
+ "bottle",
651
+ "veterinarian",
652
+ "feed",
653
+ "balance",
654
+ "scooter",
655
+ "back",
656
+ "stay",
657
+ "instruct",
658
+ "focu",
659
+ "game",
660
+ "goal",
661
+ "kick",
662
+ "shakira",
663
+ "metal",
664
+ "pillar",
665
+ "steel",
666
+ "work",
667
+ "envelope",
668
+ "xbox360",
669
+ "headset",
670
+ "mail",
671
+ "item",
672
+ "write",
673
+ "canoe",
674
+ "oanoe",
675
+ "newscaster",
676
+ "boat",
677
+ "paddle",
678
+ "sail",
679
+ "glove",
680
+ "coffee",
681
+ "cup",
682
+ "peeler",
683
+ "potoatoe",
684
+ "scrap",
685
+ "chair",
686
+ "kitchen",
687
+ "keyboard",
688
+ "synthesizer",
689
+ "piano",
690
+ "cookie",
691
+ "boarding",
692
+ "skateboard",
693
+ "cruise",
694
+ "peddle",
695
+ "flute",
696
+ "lawn",
697
+ "squirrel",
698
+ "hop",
699
+ "teenager",
700
+ "mom",
701
+ "teenage",
702
+ "carry",
703
+ "lounge",
704
+ "couple",
705
+ "video",
706
+ "girlfriend",
707
+ "conversation",
708
+ "crying",
709
+ "cry",
710
+ "mourn",
711
+ "weep",
712
+ "sob",
713
+ "grieve",
714
+ "drip",
715
+ "church",
716
+ "sunset",
717
+ "passenger",
718
+ "bcycle",
719
+ "crossbar",
720
+ "juggle",
721
+ "tos",
722
+ "soccar",
723
+ "sportsman",
724
+ "bat",
725
+ "cricket",
726
+ "error",
727
+ "cricketer",
728
+ "crowd",
729
+ "laugh",
730
+ "smile",
731
+ "dummy",
732
+ "mannequin",
733
+ "run",
734
+ "leader",
735
+ "exercise",
736
+ "chocolate",
737
+ "chocolaty",
738
+ "baker",
739
+ "wind",
740
+ "duck",
741
+ "toy",
742
+ "stream",
743
+ "creek",
744
+ "attention",
745
+ "squirt",
746
+ "lad",
747
+ "paste",
748
+ "salt",
749
+ "powder",
750
+ "dust",
751
+ "cantaloupe",
752
+ "sweet",
753
+ "pumpkin",
754
+ "cantalopue",
755
+ "sidewalk",
756
+ "sledge",
757
+ "sledgehammer",
758
+ "pick",
759
+ "tool",
760
+ "cube",
761
+ "hair",
762
+ "braid",
763
+ "bulldog",
764
+ "wad",
765
+ "fetch",
766
+ "truck",
767
+ "bumper",
768
+ "pickup",
769
+ "gazelle",
770
+ "baboon",
771
+ "antelope",
772
+ "latter",
773
+ "fawn",
774
+ "kidnap",
775
+ "charge",
776
+ "broccoli",
777
+ "rodent",
778
+ "brocoli",
779
+ "hamster",
780
+ "eye",
781
+ "eyeshadow",
782
+ "garden",
783
+ "amongst",
784
+ "stroll",
785
+ "bench",
786
+ "swing",
787
+ "leaf",
788
+ "bak",
789
+ "mouth",
790
+ "bicyclist",
791
+ "train",
792
+ "model",
793
+ "trolley",
794
+ "strike",
795
+ "vase",
796
+ "son",
797
+ "father",
798
+ "busines",
799
+ "mand",
800
+ "cleaning",
801
+ "wash",
802
+ "hoot",
803
+ "trainer",
804
+ "leg",
805
+ "teach",
806
+ "view",
807
+ "shield",
808
+ "necklace",
809
+ "husband",
810
+ "marry",
811
+ "flag",
812
+ "dinner",
813
+ "serve",
814
+ "scene",
815
+ "herself",
816
+ "background",
817
+ "carom",
818
+ "meadow",
819
+ "jungle",
820
+ "bacon",
821
+ "aluminum",
822
+ "foil",
823
+ "square",
824
+ "lamp",
825
+ "attract",
826
+ "mat",
827
+ "doorway",
828
+ "mob",
829
+ "salaryman",
830
+ "alley",
831
+ "cooked",
832
+ "rear",
833
+ "army",
834
+ "parade",
835
+ "formation",
836
+ "march",
837
+ "teacher",
838
+ "whiteboard",
839
+ "equation",
840
+ "math",
841
+ "formula",
842
+ "problem",
843
+ "solve",
844
+ "explain",
845
+ "circle",
846
+ "oven",
847
+ "bake",
848
+ "waterfall",
849
+ "island",
850
+ "creature",
851
+ "control",
852
+ "bullet",
853
+ "yellow",
854
+ "harbor",
855
+ "coast",
856
+ "speeding",
857
+ "policeman",
858
+ "officer",
859
+ "arrest",
860
+ "tie",
861
+ "handcuff",
862
+ "bikini",
863
+ "fold",
864
+ "crocodile",
865
+ "alligator",
866
+ "nose",
867
+ "pat",
868
+ "tap",
869
+ "curl",
870
+ "bodybuilder",
871
+ "weight",
872
+ "builder",
873
+ "barbell",
874
+ "night",
875
+ "dark",
876
+ "beater",
877
+ "spin",
878
+ "cage",
879
+ "zoo",
880
+ "enclosure",
881
+ "pen",
882
+ "pace",
883
+ "mammal",
884
+ "blanket",
885
+ "plastic",
886
+ "inspect",
887
+ "examine",
888
+ "bend",
889
+ "transplant",
890
+ "harvest",
891
+ "tend",
892
+ "station",
893
+ "subway",
894
+ "transit",
895
+ "disembark",
896
+ "exit",
897
+ "desk",
898
+ "rainbow",
899
+ "meow",
900
+ "nitrogen",
901
+ "liguid",
902
+ "vapor",
903
+ "steam",
904
+ "solution",
905
+ "pester",
906
+ "grab",
907
+ "funny",
908
+ "jerboa",
909
+ "chinchilla",
910
+ "mouse",
911
+ "rat",
912
+ "scurry",
913
+ "sergeant",
914
+ "seargeant",
915
+ "soldier",
916
+ "sargent",
917
+ "troop",
918
+ "commander",
919
+ "voice",
920
+ "tapir",
921
+ "branch",
922
+ "aardvark",
923
+ "search",
924
+ "forage",
925
+ "training",
926
+ "puff",
927
+ "squeeze",
928
+ "juicing",
929
+ "extract",
930
+ "strum",
931
+ "pizza",
932
+ "enjoy",
933
+ "ukulele",
934
+ "drunk",
935
+ "post",
936
+ "pavement",
937
+ "sieve",
938
+ "somone",
939
+ "strainer",
940
+ "miniature",
941
+ "coconut",
942
+ "drain",
943
+ "armchair",
944
+ "shirtles",
945
+ "richard",
946
+ "shilpa",
947
+ "wag",
948
+ "beachside",
949
+ "lead",
950
+ "cheff",
951
+ "tower",
952
+ "shape",
953
+ "pile",
954
+ "stack",
955
+ "quadropod",
956
+ "photo",
957
+ "photographer",
958
+ "tripod",
959
+ "spike",
960
+ "lug",
961
+ "mud",
962
+ "bear",
963
+ "digging",
964
+ "dig",
965
+ "scratch",
966
+ "garage",
967
+ "enter",
968
+ "shopping",
969
+ "shop",
970
+ "browse",
971
+ "toad",
972
+ "puddle",
973
+ "town",
974
+ "neighborhood",
975
+ "desert",
976
+ "steer",
977
+ "float",
978
+ "floater",
979
+ "raft",
980
+ "parrot",
981
+ "tabletop",
982
+ "beak",
983
+ "mug",
984
+ "discussion",
985
+ "discus",
986
+ "summersault",
987
+ "axe",
988
+ "bun",
989
+ "hamburger",
990
+ "sweeper",
991
+ "vacuum",
992
+ "chicken",
993
+ "shore",
994
+ "step",
995
+ "dancing",
996
+ "elephant",
997
+ "stick",
998
+ "twig",
999
+ "stem",
1000
+ "bamboo",
1001
+ "rest",
1002
+ "dvd",
1003
+ "disc",
1004
+ "disk",
1005
+ "wipe",
1006
+ "dab",
1007
+ "fix",
1008
+ "synchronization",
1009
+ "bedroom",
1010
+ "noise",
1011
+ "frog",
1012
+ "catch",
1013
+ "handle",
1014
+ "armles",
1015
+ "puppet",
1016
+ "string",
1017
+ "marionette",
1018
+ "seat",
1019
+ "leap",
1020
+ "bounce",
1021
+ "zebra",
1022
+ "nudge",
1023
+ "stroke",
1024
+ "bathroom",
1025
+ "nicholson",
1026
+ "attendant",
1027
+ "jacket",
1028
+ "park",
1029
+ "coaster",
1030
+ "jet",
1031
+ "plane",
1032
+ "runway",
1033
+ "land",
1034
+ "slicer",
1035
+ "mandolin",
1036
+ "ionion",
1037
+ "cowboy",
1038
+ "vampire",
1039
+ "monster",
1040
+ "intruder",
1041
+ "poeple",
1042
+ "submarine",
1043
+ "ship",
1044
+ "missile",
1045
+ "shuttle",
1046
+ "space",
1047
+ "bomb",
1048
+ "glide",
1049
+ "beer",
1050
+ "midget",
1051
+ "somebody",
1052
+ "goblet",
1053
+ "guiter",
1054
+ "pole",
1055
+ "fishing",
1056
+ "shave",
1057
+ "mirror",
1058
+ "image",
1059
+ "reflection",
1060
+ "shark",
1061
+ "dolphin",
1062
+ "fisherman",
1063
+ "bait",
1064
+ "steal",
1065
+ "measure",
1066
+ "surprise",
1067
+ "jut",
1068
+ "defecate",
1069
+ "skateboarder",
1070
+ "ramp",
1071
+ "tunnel",
1072
+ "mower",
1073
+ "mover",
1074
+ "mow",
1075
+ "lawnmower",
1076
+ "booty",
1077
+ "lucy",
1078
+ "animator",
1079
+ "butterfly",
1080
+ "switch",
1081
+ "twist",
1082
+ "treadmill",
1083
+ "twisting",
1084
+ "attempt",
1085
+ "halloween",
1086
+ "jaw",
1087
+ "skull",
1088
+ "mimic",
1089
+ "provide",
1090
+ "gravel",
1091
+ "floyd",
1092
+ "gilmmy",
1093
+ "ban",
1094
+ "tiller",
1095
+ "soil",
1096
+ "auger",
1097
+ "till",
1098
+ "bot",
1099
+ "powerboat",
1100
+ "yacht",
1101
+ "sea",
1102
+ "traverse",
1103
+ "bull",
1104
+ "peach",
1105
+ "avoid",
1106
+ "halve",
1107
+ "combine",
1108
+ "wooden",
1109
+ "concrete",
1110
+ "cement",
1111
+ "brick",
1112
+ "stomach",
1113
+ "bust",
1114
+ "boxing",
1115
+ "yoga",
1116
+ "meditate",
1117
+ "pray",
1118
+ "clay",
1119
+ "piggyback",
1120
+ "platform",
1121
+ "body",
1122
+ "weigt",
1123
+ "weightlift",
1124
+ "stenographer",
1125
+ "stenograph",
1126
+ "stenography",
1127
+ "type",
1128
+ "cleaver",
1129
+ "capuchin",
1130
+ "itch",
1131
+ "way",
1132
+ "moose",
1133
+ "sprinkler",
1134
+ "wet",
1135
+ "fore",
1136
+ "moonwalk",
1137
+ "hiking",
1138
+ "reaper",
1139
+ "skeleton",
1140
+ "hike",
1141
+ "welsh",
1142
+ "scuffle",
1143
+ "frolic",
1144
+ "cab",
1145
+ "taxi",
1146
+ "argue",
1147
+ "front",
1148
+ "jumper",
1149
+ "trampoline",
1150
+ "pounce",
1151
+ "cell",
1152
+ "prisoner",
1153
+ "middle",
1154
+ "prison",
1155
+ "jail",
1156
+ "figure",
1157
+ "intimidate",
1158
+ "belly",
1159
+ "bridge",
1160
+ "hanging",
1161
+ "swinging",
1162
+ "hedgehog",
1163
+ "porcupine",
1164
+ "spinner",
1165
+ "bagel",
1166
+ "recite",
1167
+ "slap",
1168
+ "letter",
1169
+ "range",
1170
+ "spray",
1171
+ "retain",
1172
+ "pilot",
1173
+ "handshake",
1174
+ "offer",
1175
+ "hose",
1176
+ "friend",
1177
+ "instructor",
1178
+ "arrange",
1179
+ "oxiclean",
1180
+ "detergent",
1181
+ "cleaner",
1182
+ "fill",
1183
+ "tea",
1184
+ "basket",
1185
+ "suck",
1186
+ "sip",
1187
+ "suckle",
1188
+ "carnival",
1189
+ "coupon",
1190
+ "ticket",
1191
+ "trunk",
1192
+ "log",
1193
+ "golf",
1194
+ "golfing",
1195
+ "bathtub",
1196
+ "tub",
1197
+ "batting",
1198
+ "knock",
1199
+ "stretcher",
1200
+ "stairmaster",
1201
+ "operate"
1202
+ ]
ChatUniVi/eval/questions/video_qa/msvd_qa.json ADDED
The diff for this file is too large to render. See raw diff