VideoScore

Runtime error

App Files Files Community

DongfuJiang committed on May 15, 2024

Commit

b9d3de1

1 Parent(s): 0afd9ef

update

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

app.py +92 -44
examples/2006733/2006733_00.jpg +0 -0
examples/2006733/2006733_01.jpg +0 -0
examples/2006733/2006733_02.jpg +0 -0
examples/2006733/2006733_03.jpg +0 -0
examples/2006733/2006733_04.jpg +0 -0
examples/2006733/2006733_05.jpg +0 -0
examples/2006733/2006733_06.jpg +0 -0
examples/2006733/2006733_07.jpg +0 -0
examples/2006733/2006733_08.jpg +0 -0
examples/2006733/2006733_09.jpg +0 -0
examples/2006733/2006733_10.jpg +0 -0
examples/2006733/2006733_11.jpg +0 -0
examples/2006733/2006733_12.jpg +0 -0
examples/2006733/2006733_13.jpg +0 -0
examples/2006733/2006733_14.jpg +0 -0
examples/2006733/2006733_15.jpg +0 -0
examples/data_subset.json +368 -0
examples/r000228/r000228_00.jpg +0 -0
examples/r000228/r000228_01.jpg +0 -0
examples/r000228/r000228_02.jpg +0 -0
examples/r000228/r000228_03.jpg +0 -0
examples/r000228/r000228_04.jpg +0 -0
examples/r000228/r000228_05.jpg +0 -0
examples/r000228/r000228_06.jpg +0 -0
examples/r000228/r000228_07.jpg +0 -0
examples/r000228/r000228_08.jpg +0 -0
examples/r000228/r000228_09.jpg +0 -0
examples/r000228/r000228_10.jpg +0 -0
examples/r000228/r000228_11.jpg +0 -0
examples/r000228/r000228_12.jpg +0 -0
examples/r000228/r000228_13.jpg +0 -0
examples/r000228/r000228_14.jpg +0 -0
examples/r000228/r000228_15.jpg +0 -0
examples/r000228/r000228_16.jpg +0 -0
examples/r000228/r000228_17.jpg +0 -0
examples/r000228/r000228_18.jpg +0 -0
examples/r000228/r000228_19.jpg +0 -0
examples/r000228/r000228_20.jpg +0 -0
examples/r000228/r000228_21.jpg +0 -0
examples/r000228/r000228_22.jpg +0 -0
examples/r000228/r000228_23.jpg +0 -0
examples/r000688/r000688_00.jpg +0 -0
examples/r000688/r000688_01.jpg +0 -0
examples/r000688/r000688_02.jpg +0 -0
examples/r000688/r000688_03.jpg +0 -0
examples/r000688/r000688_04.jpg +0 -0
examples/r000688/r000688_05.jpg +0 -0
examples/r000688/r000688_06.jpg +0 -0
examples/r000688/r000688_07.jpg +0 -0

app.py CHANGED Viewed

@@ -2,33 +2,81 @@ import gradio as gr
 import spaces
 import os
 import time
 from PIL import Image
 import functools
-from models.mllava import MLlavaProcessor, LlavaForConditionalGeneration, chat_mllava_stream, MLlavaForConditionalGeneration, chat_mllava
 from models.conversation import conv_templates
 from typing import List
-processor = MLlavaProcessor.from_pretrained("TIGER-Lab/Mantis-8B-siglip-llama3")
-model = LlavaForConditionalGeneration.from_pretrained("TIGER-Lab/Mantis-8B-siglip-llama3")
-conv_template = conv_templates['llama_3']
-@spaces.GPU
-def generate_stream(text:str, images:List[Image.Image], history: List[dict], **kwargs):
-    global processor, model
-    model = model.to("cuda")
-    if not images:
-        images = None
-    for text, history in chat_mllava_stream(text, images, model, processor, history=history, **kwargs):
-        yield text
-    return text
 @spaces.GPU
 def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
     global processor, model
-    model = model.to("cuda")
     if not images:
         images = None
-    generated_text, history = chat_mllava(text, images, model, processor, history=history, **kwargs)
     return generated_text
 def enable_next_image(uploaded_images, image):
@@ -72,7 +120,6 @@ def get_chat_images(history):
 def bot(history):
-    print(history)
     cur_messages = {"text": "", "images": []}
     for message in history[::-1]:
         if message[1]:
@@ -87,7 +134,7 @@ def bot(history):
         raise gr.Error("Please enter a message")
     if cur_messages['text'].count("<image>") < len(cur_messages['images']):
         gr.Warning("The number of images uploaded is more than the number of <image> placeholders in the text. Will automatically prepend <image> to the text.")
-        cur_messages['text'] = "<image> "* (len(cur_messages['images']) - cur_messages['text'].count("<image>")) + cur_messages['text']
         history[-1][0] = cur_messages["text"]
     if cur_messages['text'].count("<image>") > len(cur_messages['images']):
         gr.Warning("The number of images uploaded is less than the number of <image> placeholders in the text. Will automatically remove extra <image> placeholders from the text.")
@@ -105,13 +152,24 @@ def bot(history):
         "do_sample": False
     }
-    response = generate_stream(None, chat_images, chat_history, **generation_kwargs)
-    for _output in response:
-        history[-1][1] = _output
-        time.sleep(0.05)
-        yield history
 def build_demo():
     with gr.Blocks() as demo:
@@ -166,31 +224,21 @@ Mantis is a multimodal conversational AI model that can chat with users about im
         ).then(
             bot, chatbot, chatbot, api_name="bot_response"
         )
         gr.Examples(
             examples=[
-                {
-                    "text": "<image> <image> <image> Which image shows a different mood of character from the others?",
-                    "files": ["./examples/image12.jpg", "./examples/image13.jpg", "./examples/image14.jpg"]
-                },
-                {
-                    "text": "<image> <image> What's the difference between these two images? Please describe as much as you can.",
-                    "files": ["./examples/image1.jpg", "./examples/image2.jpg"]
-                },
-                {
-                    "text": "<image> <image> Which image shows an older dog?",
-                    "files": ["./examples/image8.jpg", "./examples/image9.jpg"]
-                },
-                {
-                    "text": "Write a description for the given image sequence in a single paragraph, what is happening in this episode?",
-                    "files": ["./examples/image3.jpg", "./examples/image4.jpg", "./examples/image5.jpg", "./examples/image6.jpg", "./examples/image7.jpg"]
-                },
-                {
-                    "text": "<image> <image> How many dices are there in image 1 and image 2 respectively?",
-                    "files": ["./examples/image10.jpg", "./examples/image15.jpg"]
-                },
             ],
-            inputs=[chat_input],
         )
         gr.Markdown("""

 import spaces
 import os
 import time
+import json
 from PIL import Image
 import functools
+from transformers import AutoProcessor, Idefics2ForConditionalGeneration
 from models.conversation import conv_templates
 from typing import List
+processor = AutoProcessor.from_pretrained("MFuyu/mantis-8b-idefics2-video-eval_8192_lora")
+model = Idefics2ForConditionalGeneration.from_pretrained("MFuyu/mantis-8b-idefics2-video-eval_8192_lora")
+conv_template = conv_templates["idefics_2"]
+with open("./examples/data_subset.json", 'r') as f:
+    examples = json.load(f)
+for item in examples:
+    video_id = item['images'][0].split("_")[0]
+    item['images'] = [os.path.join("./examples", video_id, x) for x in item['images']]
+prompt = "Suppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is the beautiful girl, long hair,walk on the sity street, red cloth  ,\nall the frames of video are as follows: \n\n"
 @spaces.GPU
 def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
     global processor, model
+    model = model.to("cuda") if model.device.type != "cuda" else model
     if not images:
         images = None
+    user_role = conv_template.roles[0]
+    assistant_role = conv_template.roles[1]
+    idefics_2_message = []
+    cur_img_idx = 0
+    print(history)
+    for i, message in enumerate(history):
+        if message["role"] == user_role:
+            idefics_2_message.append({
+                "role": user_role,
+                "content": []
+            })
+            message_text = message["text"]
+            num_image_tokens_in_text = message_text.count("<image>")
+            if num_image_tokens_in_text > 0:
+                sub_texts = [x.strip() for x in message_text.split("<image>")]
+                if sub_texts[0]:
+                    idefics_2_message[-1]["content"].append({"type": "text", "text": sub_texts[0]})
+                for sub_text in sub_texts[1:]:
+                    idefics_2_message[-1]["content"].append({"type": "image"})
+                    if sub_text:
+                        idefics_2_message.append({
+                            "role": user_role,
+                            "content": [{"type": "text", "text": sub_text}]
+                        })
+            else:
+                idefics_2_message[-1]["content"].append({"type": "text", "text": message_text})
+        elif message["role"] == assistant_role:
+            if i == len(history) - 1 and not message["text"]:
+                break
+            idefics_2_message.append({
+                "role": assistant_role,
+                "content": [{"type": "text", "text": message["text"]}]
+            })
+    if text:
+        assert idefics_2_message[-1]["role"] == assistant_role and not idefics_2_message[-1]["content"], "Internal error"
+        idefics_2_message.append({
+            "role": user_role,
+            "content": [{"type": "text", "text": text}]
+        })
+    print(idefics_2_message)
+    prompt = processor.apply_chat_template(idefics_2_message, add_generation_prompt=True)
+    images = [Image.open(x) for x in images]
+    inputs = processor(text=prompt, images=images, return_tensors="pt")
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+    outputs = model.generate(**inputs, max_new_tokens=1024)
+    generated_text = processor.decode(outputs[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
     return generated_text
 def enable_next_image(uploaded_images, image):
 def bot(history):
     cur_messages = {"text": "", "images": []}
     for message in history[::-1]:
         if message[1]:
         raise gr.Error("Please enter a message")
     if cur_messages['text'].count("<image>") < len(cur_messages['images']):
         gr.Warning("The number of images uploaded is more than the number of <image> placeholders in the text. Will automatically prepend <image> to the text.")
+        cur_messages['text'] += "<image> "* (len(cur_messages['images']) - cur_messages['text'].count("<image>"))
         history[-1][0] = cur_messages["text"]
     if cur_messages['text'].count("<image>") > len(cur_messages['images']):
         gr.Warning("The number of images uploaded is less than the number of <image> placeholders in the text. Will automatically remove extra <image> placeholders from the text.")
         "do_sample": False
     }
+    response = generate(None, chat_images, chat_history, **generation_kwargs)
+    return response
+    # for _output in response:
+    #     history[-1][1] = _output
+    #     time.sleep(0.05)
+    #     yield history
+def get_images(video_folder:str):
+    """
+    video folder contains images files like {video_folder_name}_00.jpg, {video_folder_name}_01.jpg, ...
+    """
+    images = []
+    for file in os.listdir(video_folder):
+        if file.endswith(".jpg"):
+            images.append(Image.open(os.path.join(video_folder, file)))
+    # sort images by name
+    images.sort(key=lambda x: int(x.filename.split("_")[-1].split(".")[0]))
+    return images
 def build_demo():
     with gr.Blocks() as demo:
         ).then(
             bot, chatbot, chatbot, api_name="bot_response"
         )
+        dummy_id = gr.Textbox("dummy_id", label="dummy_id", visible=False)
+        dummy_output = gr.Textbox("dummy_output", label="dummy_output", visible=False)
         gr.Examples(
             examples=[
+                [
+                    item['id'],
+                    {
+                        "text": item['conversations'][0]['value'],
+                        "files": item['images']
+                    },
+                    item['conversations'][1]['value']
+                ] for item in examples
             ],
+            inputs=[dummy_id, chat_input, dummy_output],
         )
         gr.Markdown("""

examples/2006733/2006733_00.jpg ADDED Viewed

examples/2006733/2006733_01.jpg ADDED Viewed

examples/2006733/2006733_02.jpg ADDED Viewed

examples/2006733/2006733_03.jpg ADDED Viewed

examples/2006733/2006733_04.jpg ADDED Viewed

examples/2006733/2006733_05.jpg ADDED Viewed

examples/2006733/2006733_06.jpg ADDED Viewed

examples/2006733/2006733_07.jpg ADDED Viewed

examples/2006733/2006733_08.jpg ADDED Viewed

examples/2006733/2006733_09.jpg ADDED Viewed

examples/2006733/2006733_10.jpg ADDED Viewed

examples/2006733/2006733_11.jpg ADDED Viewed

examples/2006733/2006733_12.jpg ADDED Viewed

examples/2006733/2006733_13.jpg ADDED Viewed

examples/2006733/2006733_14.jpg ADDED Viewed

examples/2006733/2006733_15.jpg ADDED Viewed

examples/data_subset.json ADDED Viewed

	@@ -0,0 +1,368 @@

+[
+    {
+        "id": "r003679",
+        "images": [
+            "r003679_00.jpg",
+            "r003679_01.jpg",
+            "r003679_02.jpg",
+            "r003679_03.jpg",
+            "r003679_04.jpg",
+            "r003679_05.jpg",
+            "r003679_06.jpg",
+            "r003679_07.jpg",
+            "r003679_08.jpg",
+            "r003679_09.jpg",
+            "r003679_10.jpg",
+            "r003679_11.jpg",
+            "r003679_12.jpg",
+            "r003679_13.jpg",
+            "r003679_14.jpg",
+            "r003679_15.jpg",
+            "r003679_16.jpg",
+            "r003679_17.jpg",
+            "r003679_18.jpg",
+            "r003679_19.jpg",
+            "r003679_20.jpg",
+            "r003679_21.jpg",
+            "r003679_22.jpg",
+            "r003679_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is dog rolls over and falls off couch,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r100916",
+        "images": [
+            "r100916_00.jpg",
+            "r100916_01.jpg",
+            "r100916_02.jpg",
+            "r100916_03.jpg",
+            "r100916_04.jpg",
+            "r100916_05.jpg",
+            "r100916_06.jpg",
+            "r100916_07.jpg",
+            "r100916_08.jpg",
+            "r100916_09.jpg",
+            "r100916_10.jpg",
+            "r100916_11.jpg",
+            "r100916_12.jpg",
+            "r100916_13.jpg",
+            "r100916_14.jpg",
+            "r100916_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A young man is standing in front of a glacier with mountains in the background.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r004061",
+        "images": [
+            "r004061_00.jpg",
+            "r004061_01.jpg",
+            "r004061_02.jpg",
+            "r004061_03.jpg",
+            "r004061_04.jpg",
+            "r004061_05.jpg",
+            "r004061_06.jpg",
+            "r004061_07.jpg",
+            "r004061_08.jpg",
+            "r004061_09.jpg",
+            "r004061_10.jpg",
+            "r004061_11.jpg",
+            "r004061_12.jpg",
+            "r004061_13.jpg",
+            "r004061_14.jpg",
+            "r004061_15.jpg",
+            "r004061_16.jpg",
+            "r004061_17.jpg",
+            "r004061_18.jpg",
+            "r004061_19.jpg",
+            "r004061_20.jpg",
+            "r004061_21.jpg",
+            "r004061_22.jpg",
+            "r004061_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a man wearing a striped blanket over his backpack runs behind two other men.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r000228",
+        "images": [
+            "r000228_00.jpg",
+            "r000228_01.jpg",
+            "r000228_02.jpg",
+            "r000228_03.jpg",
+            "r000228_04.jpg",
+            "r000228_05.jpg",
+            "r000228_06.jpg",
+            "r000228_07.jpg",
+            "r000228_08.jpg",
+            "r000228_09.jpg",
+            "r000228_10.jpg",
+            "r000228_11.jpg",
+            "r000228_12.jpg",
+            "r000228_13.jpg",
+            "r000228_14.jpg",
+            "r000228_15.jpg",
+            "r000228_16.jpg",
+            "r000228_17.jpg",
+            "r000228_18.jpg",
+            "r000228_19.jpg",
+            "r000228_20.jpg",
+            "r000228_21.jpg",
+            "r000228_22.jpg",
+            "r000228_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is two people get into sports car,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r100887",
+        "images": [
+            "r100887_00.jpg",
+            "r100887_01.jpg",
+            "r100887_02.jpg",
+            "r100887_03.jpg",
+            "r100887_04.jpg",
+            "r100887_05.jpg",
+            "r100887_06.jpg",
+            "r100887_07.jpg",
+            "r100887_08.jpg",
+            "r100887_09.jpg",
+            "r100887_10.jpg",
+            "r100887_11.jpg",
+            "r100887_12.jpg",
+            "r100887_13.jpg",
+            "r100887_14.jpg",
+            "r100887_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A person using a spatula to spread something on a pan.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r104340",
+        "images": [
+            "r104340_00.jpg",
+            "r104340_01.jpg",
+            "r104340_02.jpg",
+            "r104340_03.jpg",
+            "r104340_04.jpg",
+            "r104340_05.jpg",
+            "r104340_06.jpg",
+            "r104340_07.jpg",
+            "r104340_08.jpg",
+            "r104340_09.jpg",
+            "r104340_10.jpg",
+            "r104340_11.jpg",
+            "r104340_12.jpg",
+            "r104340_13.jpg",
+            "r104340_14.jpg",
+            "r104340_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A bartender is making an alcoholic drink by pouring a clear liquid into a glass with ice cubes.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r001599",
+        "images": [
+            "r001599_00.jpg",
+            "r001599_01.jpg",
+            "r001599_02.jpg",
+            "r001599_03.jpg",
+            "r001599_04.jpg",
+            "r001599_05.jpg",
+            "r001599_06.jpg",
+            "r001599_07.jpg",
+            "r001599_08.jpg",
+            "r001599_09.jpg",
+            "r001599_10.jpg",
+            "r001599_11.jpg",
+            "r001599_12.jpg",
+            "r001599_13.jpg",
+            "r001599_14.jpg",
+            "r001599_15.jpg",
+            "r001599_16.jpg",
+            "r001599_17.jpg",
+            "r001599_18.jpg",
+            "r001599_19.jpg",
+            "r001599_20.jpg",
+            "r001599_21.jpg",
+            "r001599_22.jpg",
+            "r001599_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a man uses a camera,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r101298",
+        "images": [
+            "r101298_00.jpg",
+            "r101298_01.jpg",
+            "r101298_02.jpg",
+            "r101298_03.jpg",
+            "r101298_04.jpg",
+            "r101298_05.jpg",
+            "r101298_06.jpg",
+            "r101298_07.jpg",
+            "r101298_08.jpg",
+            "r101298_09.jpg",
+            "r101298_10.jpg",
+            "r101298_11.jpg",
+            "r101298_12.jpg",
+            "r101298_13.jpg",
+            "r101298_14.jpg",
+            "r101298_15.jpg",
+            "r101298_16.jpg",
+            "r101298_17.jpg",
+            "r101298_18.jpg",
+            "r101298_19.jpg",
+            "r101298_20.jpg",
+            "r101298_21.jpg",
+            "r101298_22.jpg",
+            "r101298_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A baseball game is being played in a stadium with a large crowd of people watching.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r000688",
+        "images": [
+            "r000688_00.jpg",
+            "r000688_01.jpg",
+            "r000688_02.jpg",
+            "r000688_03.jpg",
+            "r000688_04.jpg",
+            "r000688_05.jpg",
+            "r000688_06.jpg",
+            "r000688_07.jpg",
+            "r000688_08.jpg",
+            "r000688_09.jpg",
+            "r000688_10.jpg",
+            "r000688_11.jpg",
+            "r000688_12.jpg",
+            "r000688_13.jpg",
+            "r000688_14.jpg",
+            "r000688_15.jpg",
+            "r000688_16.jpg",
+            "r000688_17.jpg",
+            "r000688_18.jpg",
+            "r000688_19.jpg",
+            "r000688_20.jpg",
+            "r000688_21.jpg",
+            "r000688_22.jpg",
+            "r000688_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is the child furthest to the right dips her hand into the bowl shared by the other children.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "r104350",
+        "images": [
+            "r104350_00.jpg",
+            "r104350_01.jpg",
+            "r104350_02.jpg",
+            "r104350_03.jpg",
+            "r104350_04.jpg",
+            "r104350_05.jpg",
+            "r104350_06.jpg",
+            "r104350_07.jpg",
+            "r104350_08.jpg",
+            "r104350_09.jpg",
+            "r104350_10.jpg",
+            "r104350_11.jpg",
+            "r104350_12.jpg",
+            "r104350_13.jpg",
+            "r104350_14.jpg",
+            "r104350_15.jpg",
+            "r104350_16.jpg",
+            "r104350_17.jpg",
+            "r104350_18.jpg",
+            "r104350_19.jpg",
+            "r104350_20.jpg",
+            "r104350_21.jpg",
+            "r104350_22.jpg",
+            "r104350_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A young man is talking to a young woman in a room with books on the shelves.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    }
+]