DongfuJiang committed
Commit
5854fc5
1 Parent(s): 74aab33
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitignore +3 -1
  2. app.py +127 -160
  3. examples/1006309.mp4 +0 -0
  4. examples/2006733/2006733_00.jpg +0 -0
  5. examples/2006733/2006733_01.jpg +0 -0
  6. examples/2006733/2006733_02.jpg +0 -0
  7. examples/2006733/2006733_03.jpg +0 -0
  8. examples/2006733/2006733_04.jpg +0 -0
  9. examples/2006733/2006733_05.jpg +0 -0
  10. examples/2006733/2006733_06.jpg +0 -0
  11. examples/2006733/2006733_07.jpg +0 -0
  12. examples/2006733/2006733_08.jpg +0 -0
  13. examples/2006733/2006733_09.jpg +0 -0
  14. examples/2006733/2006733_10.jpg +0 -0
  15. examples/2006733/2006733_11.jpg +0 -0
  16. examples/2006733/2006733_12.jpg +0 -0
  17. examples/2006733/2006733_13.jpg +0 -0
  18. examples/2006733/2006733_14.jpg +0 -0
  19. examples/2006733/2006733_15.jpg +0 -0
  20. examples/3005033.mp4 +0 -0
  21. examples/7004180.mp4 +0 -0
  22. examples/a400480.mp4 +0 -0
  23. examples/a500010.mp4 +0 -0
  24. examples/a500251.mp4 +0 -0
  25. examples/all_subsets.json +475 -1
  26. examples/b304986.mp4 +0 -0
  27. examples/b402727.mp4 +0 -0
  28. examples/b404675.mp4 +0 -0
  29. examples/d401950.mp4 +0 -0
  30. examples/d500506.mp4 +0 -0
  31. examples/d500937.mp4 +0 -0
  32. examples/data_subset.json +0 -368
  33. examples/image1.jpg +0 -0
  34. examples/image10.jpg +0 -0
  35. examples/image11.jpg +0 -0
  36. examples/image12.jpg +0 -0
  37. examples/image13.jpg +0 -0
  38. examples/image14.jpg +0 -0
  39. examples/image15.jpg +0 -0
  40. examples/image2.jpg +0 -0
  41. examples/image3.jpg +0 -0
  42. examples/image4.jpg +0 -0
  43. examples/image5.jpg +0 -0
  44. examples/image6.jpg +0 -0
  45. examples/image7.jpg +0 -0
  46. examples/image8.jpg +0 -0
  47. examples/image9.jpg +0 -0
  48. examples/r000228/r000228_00.jpg +0 -0
  49. examples/r000228/r000228_01.jpg +0 -0
  50. examples/r000228/r000228_02.jpg +0 -0
.gitignore CHANGED
@@ -156,4 +156,6 @@ cython_debug/
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+
+ /.vscode
app.py CHANGED
@@ -3,13 +3,16 @@ import spaces
import os
import time
import json
+ import numpy as np
+ import av
from PIL import Image
import functools
from transformers import AutoProcessor, Idefics2ForConditionalGeneration
from models.conversation import conv_templates
from typing import List
- processor = AutoProcessor.from_pretrained("MFuyu/mantis-8b-idefics2-video-eval_8192_lora")
- model = Idefics2ForConditionalGeneration.from_pretrained("MFuyu/mantis-8b-idefics2-video-eval_8192_lora")
+ processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-95k-mantis-2epoch_4096")
+ model = Idefics2ForConditionalGeneration.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-95k-mantis-2epoch_4096")
+ MAX_NUM_FRAMES = 24
conv_template = conv_templates["idefics_2"]

with open("./examples/all_subsets.json", 'r') as f:
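This hunk swaps the checkpoint from MFuyu/mantis-8b-idefics2-video-eval_8192_lora to Mantis-VL/mantis-8b-idefics2-video-eval-95k-mantis-2epoch_4096 and adds MAX_NUM_FRAMES for frame sampling. A minimal sketch of loading that checkpoint outside the Space; the half-precision and device_map settings are illustrative assumptions, not part of this commit:

```python
# Sketch only: loading the checkpoint this commit switches to, outside the Space.
# torch_dtype/device_map below are assumptions for illustration; the diff itself loads the
# model with default settings and moves inputs to model.device at generation time.
import torch
from transformers import AutoProcessor, Idefics2ForConditionalGeneration

ckpt = "Mantis-VL/mantis-8b-idefics2-video-eval-95k-mantis-2epoch_4096"
processor = AutoProcessor.from_pretrained(ckpt)
model = Idefics2ForConditionalGeneration.from_pretrained(
    ckpt, torch_dtype=torch.float16, device_map="auto"
)
```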
@@ -18,8 +21,33 @@ with open("./examples/all_subsets.json", 'r') as f:
for item in examples:
    video_id = item['images'][0].split("_")[0]
    item['images'] = [os.path.join("./examples", video_id, x) for x in item['images']]
-
- prompt = "Suppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is the beautiful girl, long hair,walk on the sity street, red cloth ,\nall the frames of video are as follows: \n\n"
+     item['video'] = os.path.join("./examples", item['video'])
+
+ VIDEO_EVAL_PROMPT = """
+ Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
+ please watch the following frames of a given video and see the text prompt for generating the video,
+ then give scores from 7 different dimensions:
+ (1) visual quality: the quality of the video in terms of clearness, resolution, brightness, and color
+ (2) object consistency, the consistency of objects or humans in video
+ (3) dynamic degree, the degree of dynamic changes
+ (4) motion smoothness, the smoothness of motion or movements
+ (5) text-to-video alignment, the alignment between the text prompt and the video content
+ (6) factual consistency, the consistency of the video content with the common-sense and factual knowledge
+ (7) overall score, the overall quality of the video
+ for each dimension, output a number from [1,2,3], in which '1' is 'Bad', '2' is 'Average', '3' is 'Good'.
+ Here is an output example:
+ visual quality: 3
+ object consistency: 2
+ dynamic degree: 2
+ motion smoothness: 1
+ text-to-video alignment: 1
+ factual consistency: 2
+ overall score: 1
+
+ For this video, the text prompt is "{text_prompt}",
+ all the frames of video are as follows:
+
+ """
@spaces.GPU
def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
    global processor, model
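The VIDEO_EVAL_PROMPT template added here is consumed further down by eval_video, which fills {text_prompt} and appends a <video> placeholder. A minimal sketch of the rendered prompt for one of the bundled examples; the import is an assumption to make the snippet standalone, and the prompt string is the one stored for real_r003679 in examples/all_subsets.json:

```python
# Sketch: rendering VIDEO_EVAL_PROMPT the way eval_video() does further down in this diff.
from app import VIDEO_EVAL_PROMPT  # assumption: importable from this Space's app.py

text_prompt = "dog rolls over and falls off couch"  # prompt of example "real_r003679"
eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=text_prompt) + "<video>"
print(eval_prompt)  # ends with: For this video, the text prompt is "dog rolls over and falls off couch", ... <video>
```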
@@ -32,6 +60,14 @@ def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):

    idefics_2_message = []
    cur_img_idx = 0
+     cur_vid_idx = 0
+     all_videos = [x for x in images if isinstance(x, list)]
+     flatten_images = []
+     for x in images:
+         if isinstance(x, list):
+             flatten_images.extend(x)
+         else:
+             flatten_images.append(x)

    print(history)
    for i, message in enumerate(history):
@@ -41,6 +77,11 @@ def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
                "content": []
            })
        message_text = message["text"]
+         num_video_tokens_in_text = message_text.count("<video>")
+         if num_video_tokens_in_text > 0:
+             for _ in range(num_video_tokens_in_text):
+                 message_text = message_text.replace("<video>", "<image> " * len(all_videos[cur_vid_idx]), 1)
+                 cur_vid_idx += 1
        num_image_tokens_in_text = message_text.count("<image>")
        if num_image_tokens_in_text > 0:
            sub_texts = [x.strip() for x in message_text.split("<image>")]
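This is the only place where <video> placeholders are consumed: each one is rewritten, in order, into one <image> token per frame of the corresponding video, so the existing <image> handling below works unchanged. A self-contained sketch of that expansion with toy data (the strings are illustrative, not from the repo):

```python
# Toy illustration of the <video> -> "<image> " expansion added in this hunk.
message_text = "Please rate this video: <video>"
all_videos = [["frame_0", "frame_1", "frame_2"]]  # hypothetical: one video, three sampled frames

cur_vid_idx = 0
for _ in range(message_text.count("<video>")):
    # replace exactly one <video> per iteration so each video maps to its own frame count
    message_text = message_text.replace("<video>", "<image> " * len(all_videos[cur_vid_idx]), 1)
    cur_vid_idx += 1

print(message_text)  # "Please rate this video: <image> <image> <image> "
```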
@@ -72,185 +113,111 @@ def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
    print(idefics_2_message)
    prompt = processor.apply_chat_template(idefics_2_message, add_generation_prompt=True)

-     images = [Image.open(x) for x in images]
+     images = [Image.open(x) if isinstance(x, str) else x for x in flatten_images]
    inputs = processor(text=prompt, images=images, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    outputs = model.generate(**inputs, max_new_tokens=1024)
    generated_text = processor.decode(outputs[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
    return generated_text

- def enable_next_image(uploaded_images, image):
-     uploaded_images.append(image)
-     return uploaded_images, gr.MultimodalTextbox(value=None, interactive=False)
-
- def add_message(history, message):
-     if message["files"]:
-         for file in message["files"]:
-             history.append([(file,), None])
-     if message["text"]:
-         history.append([message["text"], None])
-     return history, gr.MultimodalTextbox(value=None)
-
- def print_like_dislike(x: gr.LikeData):
-     print(x.index, x.value, x.liked)
-
-
- def get_chat_history(history):
-     chat_history = []
-     user_role = conv_template.roles[0]
-     assistant_role = conv_template.roles[1]
-     for i, message in enumerate(history):
-         if isinstance(message[0], str):
-             chat_history.append({"role": user_role, "text": message[0]})
-             if i != len(history) - 1:
-                 assert message[1], "The bot message is not provided, internal error"
-                 chat_history.append({"role": assistant_role, "text": message[1]})
-             else:
-                 assert not message[1], "the bot message internal error, get: {}".format(message[1])
-                 chat_history.append({"role": assistant_role, "text": ""})
-     return chat_history
-
-
- def get_chat_images(history):
-     images = []
-     for message in history:
-         if isinstance(message[0], tuple):
-             images.extend(message[0])
-     return images
-
-
- def bot(history):
-     cur_messages = {"text": "", "images": []}
-     for message in history[::-1]:
-         if message[1]:
-             break
-         if isinstance(message[0], str):
-             cur_messages["text"] = message[0] + " " + cur_messages["text"]
-         elif isinstance(message[0], tuple):
-             cur_messages["images"].extend(message[0])
-     cur_messages["text"] = cur_messages["text"].strip()
-     cur_messages["images"] = cur_messages["images"][::-1]
-     if not cur_messages["text"]:
-         raise gr.Error("Please enter a message")
-     if cur_messages['text'].count("<image>") < len(cur_messages['images']):
-         gr.Warning("The number of images uploaded is more than the number of <image> placeholders in the text. Will automatically prepend <image> to the text.")
-         cur_messages['text'] += "<image> "* (len(cur_messages['images']) - cur_messages['text'].count("<image>"))
-         history[-1][0] = cur_messages["text"]
-     if cur_messages['text'].count("<image>") > len(cur_messages['images']):
-         gr.Warning("The number of images uploaded is less than the number of <image> placeholders in the text. Will automatically remove extra <image> placeholders from the text.")
-         cur_messages['text'] = cur_messages['text'][::-1].replace("<image>"[::-1], "", cur_messages['text'].count("<image>") - len(cur_messages['images']))[::-1]
-         history[-1][0] = cur_messages["text"]
-

+ def read_video_pyav(container, indices):
+     '''
+     Decode the video with PyAV decoder.
+
+     Args:
+         container (av.container.input.InputContainer): PyAV container.
+         indices (List[int]): List of frame indices to decode.
+
+     Returns:
+         np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
+     '''
+     frames = []
+     container.seek(0)
+     start_index = indices[0]
+     end_index = indices[-1]
+     for i, frame in enumerate(container.decode(video=0)):
+         if i > end_index:
+             break
+         if i >= start_index and i in indices:
+             frames.append(frame)
+     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+ def eval_video(prompt, video:str):
+     container = av.open(video)
+
+     # sample uniformly 8 frames from the video
+     total_frames = container.streams.video[0].frames
+     if total_frames > MAX_NUM_FRAMES:
+         indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
+     else:
+         indices = np.arange(total_frames)
+     video_frames = read_video_pyav(container, indices)
+
+     frames = [Image.fromarray(x) for x in video_frames]

-     chat_history = get_chat_history(history)
-     chat_images = get_chat_images(history)
-
-     generation_kwargs = {
-         "max_new_tokens": 4096,
-         "num_beams": 1,
-         "do_sample": False
-     }
-
-     response = generate(None, chat_images, chat_history, **generation_kwargs)
-     history[-1][1] = response
-     return history
-     # for _output in response:
-     #     history[-1][1] = _output
-     #     time.sleep(0.05)
-     #     yield history
+     eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=prompt)
+     eval_prompt += "<video>"
+     user_role = conv_template.roles[0]
+     assistant_role = conv_template.roles[1]
+     chat_messages = [
+         {
+             "role": user_role,
+             "text": eval_prompt
+         },
+         {
+             "role": assistant_role,
+             "text": ""
+         }
+     ]
+     response = generate(None, [frames], chat_messages)
+     return response

- def get_images(video_folder:str):
-     """
-     video folder contains images files like {video_folder_name}_00.jpg, {video_folder_name}_01.jpg, ...
-     """
-     images = []
-     for file in os.listdir(video_folder):
-         if file.endswith(".jpg"):
-             images.append(Image.open(os.path.join(video_folder, file)))
-     # sort images by name
-     images.sort(key=lambda x: int(x.filename.split("_")[-1].split(".")[0]))
-     return images
-
def build_demo():
    with gr.Blocks() as demo:
-
-         gr.Markdown(""" # Mantis
-         Mantis is a multimodal conversational AI model that can chat with users about images and text. It's optimized for multi-image reasoning, where inverleaved text and images can be used to generate responses.
-
-         ### [Paper](https://arxiv.org/abs/2405.01483) | [Github](https://github.com/TIGER-AI-Lab/Mantis) | [Models](https://huggingface.co/collections/TIGER-Lab/mantis-6619b0834594c878cdb1d6e4) | [Dataset](https://huggingface.co/datasets/TIGER-Lab/Mantis-Instruct) | [Website](https://tiger-ai-lab.github.io/Mantis/)
-         """)
-
-         gr.Markdown("""## Chat with Mantis
-         Mantis supports interleaved text-image input format, where you can simply use the placeholder `<image>` to indicate the position of uploaded images.
-         The model is optimized for multi-image reasoning, while preserving the ability to chat about text and images in a single conversation.
+         gr.Markdown("""
+         ## Video Evaluation
+         upload a video along with a text prompt when generating the video, this model will evaluate the video's quality from 7 different dimensions.
        """)
-
-         chatbot = gr.Chatbot(line_breaks=True)
-         chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload images. Please use <image> to indicate the position of uploaded images", show_label=True)
-
-         chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
-
-         """
-         with gr.Accordion(label='Advanced options', open=False):
-             temperature = gr.Slider(
-                 label='Temperature',
-                 minimum=0.1,
-                 maximum=2.0,
-                 step=0.1,
-                 value=0.2,
-                 interactive=True
-             )
-             top_p = gr.Slider(
-                 label='Top-p',
-                 minimum=0.05,
-                 maximum=1.0,
-                 step=0.05,
-                 value=1.0,
-                 interactive=True
-             )
-         """
-
-         bot_msg = chat_msg.success(bot, chatbot, chatbot, api_name="bot_response")
-
-         chatbot.like(print_like_dislike, None, None)
-
        with gr.Row():
-             send_button = gr.Button("Send")
-             clear_button = gr.ClearButton([chatbot, chat_input])
-
-         send_button.click(
-             add_message, [chatbot, chat_input], [chatbot, chat_input]
-         ).then(
-             bot, chatbot, chatbot, api_name="bot_response"
+             video = gr.Video(width=500, label="Video")
+             with gr.Column():
+                 video_prompt = gr.Textbox(label="Text Prompt", interactive=False, lines=1)
+                 with gr.Row():
+                     eval_button = gr.Button("Evaluate Video")
+                     clear_button = gr.ClearButton([video])
+                 eval_result = gr.Textbox(label="Evaluation result", interactive=False, lines=7)
+
+         eval_button.click(
+             eval_video, [video_prompt, video], [eval_result]
        )
-         dummy_id = gr.Textbox("dummy_id", label="dummy_id", visible=False)
-         dummy_output = gr.Textbox("dummy_output", label="dummy_output", visible=False)
+
+         dummy_id = gr.Textbox("id", label="id", visible=False, min_width=50)
+         dummy_output = gr.Textbox("reference score", label="reference scores", visible=False, lines=7)

        gr.Examples(
-             examples=[
+             examples=
+             [
                [
                    item['id'],
-                     {
-                         "text": item['conversations'][0]['value'],
-                         "files": item['images']
-                     },
+                     item['prompt'],
+                     item['video'],
                    item['conversations'][1]['value']
                ] for item in examples
            ],
-             inputs=[dummy_id, chat_input, dummy_output],
+             inputs=[dummy_id, video_prompt, video, dummy_output],
        )

-         gr.Markdown("""
-         ## Citation
-         ```
-         @article{jiang2024mantis,
-         title={MANTIS: Interleaved Multi-Image Instruction Tuning},
-         author={Jiang, Dongfu and He, Xuan and Zeng, Huaye and Wei, Con and Ku, Max and Liu, Qian and Chen, Wenhu},
-         journal={arXiv preprint arXiv:2405.01483},
-         year={2024}
-         }
-         ```""")
+         # gr.Markdown("""
+         # ## Citation
+         # ```
+         # @article{jiang2024mantis,
+         # title={MANTIS: Interleaved Multi-Image Instruction Tuning},
+         # author={Jiang, Dongfu and He, Xuan and Zeng, Huaye and Wei, Con and Ku, Max and Liu, Qian and Chen, Wenhu},
+         # journal={arXiv preprint arXiv:2405.01483},
+         # year={2024}
+         # }
+         # ```""")
    return demo

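With these changes, the Evaluate Video button is a thin wrapper around eval_video. A minimal sketch of driving the same path without the Gradio UI, using a clip added in this commit; the import and the prompt string (taken from the a500010 entry in examples/all_subsets.json) are assumptions about how one would call it, not code from the diff:

```python
# Minimal sketch: call the new evaluation path directly, without the Gradio UI.
# Assumes app.py is importable in the same environment as the Space (transformers, av,
# numpy, Pillow, and the models/ package available) and exposes eval_video at module level.
from app import eval_video

video_path = "./examples/a500010.mp4"            # one of the .mp4 files added in this commit
text_prompt = "a druid in the forest cinematic"  # prompt stored for this clip in all_subsets.json

result = eval_video(text_prompt, video_path)
print(result)  # expected to follow the "visual quality: ... overall score: ..." format from VIDEO_EVAL_PROMPT
```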
 
examples/1006309.mp4 ADDED
Binary file (221 kB)
 
examples/2006733/2006733_00.jpg DELETED
Binary file (12.6 kB)
 
examples/2006733/2006733_01.jpg DELETED
Binary file (12.5 kB)
 
examples/2006733/2006733_02.jpg DELETED
Binary file (12.5 kB)
 
examples/2006733/2006733_03.jpg DELETED
Binary file (12.5 kB)
 
examples/2006733/2006733_04.jpg DELETED
Binary file (13.3 kB)
 
examples/2006733/2006733_05.jpg DELETED
Binary file (13.4 kB)
 
examples/2006733/2006733_06.jpg DELETED
Binary file (13.3 kB)
 
examples/2006733/2006733_07.jpg DELETED
Binary file (13.2 kB)
 
examples/2006733/2006733_08.jpg DELETED
Binary file (13.5 kB)
 
examples/2006733/2006733_09.jpg DELETED
Binary file (13.5 kB)
 
examples/2006733/2006733_10.jpg DELETED
Binary file (13.7 kB)
 
examples/2006733/2006733_11.jpg DELETED
Binary file (13.5 kB)
 
examples/2006733/2006733_12.jpg DELETED
Binary file (12.7 kB)
 
examples/2006733/2006733_13.jpg DELETED
Binary file (12.7 kB)
 
examples/2006733/2006733_14.jpg DELETED
Binary file (12.7 kB)
 
examples/2006733/2006733_15.jpg DELETED
Binary file (12.7 kB)
 
examples/3005033.mp4 ADDED
Binary file (80.1 kB)
 
examples/7004180.mp4 ADDED
Binary file (26.4 kB)
 
examples/a400480.mp4 ADDED
Binary file (27.1 kB)
 
examples/a500010.mp4 ADDED
Binary file (59.7 kB)
 
examples/a500251.mp4 ADDED
Binary file (135 kB)
 
examples/all_subsets.json CHANGED
@@ -1 +1,475 @@
- [{"id": "real_r003679", "images": ["r003679_00.jpg", "r003679_01.jpg", "r003679_02.jpg", "r003679_03.jpg", "r003679_04.jpg", "r003679_05.jpg", "r003679_06.jpg", "r003679_07.jpg", "r003679_08.jpg", "r003679_09.jpg", "r003679_10.jpg", "r003679_11.jpg", "r003679_12.jpg", "r003679_13.jpg", "r003679_14.jpg", "r003679_15.jpg", "r003679_16.jpg", "r003679_17.jpg", "r003679_18.jpg", "r003679_19.jpg", "r003679_20.jpg", "r003679_21.jpg", "r003679_22.jpg", "r003679_23.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is dog rolls over and falls off couch,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"}]}, {"id": "real_r100916", "images": ["r100916_00.jpg", "r100916_01.jpg", "r100916_02.jpg", "r100916_03.jpg", "r100916_04.jpg", "r100916_05.jpg", "r100916_06.jpg", "r100916_07.jpg", "r100916_08.jpg", "r100916_09.jpg", "r100916_10.jpg", "r100916_11.jpg", "r100916_12.jpg", "r100916_13.jpg", "r100916_14.jpg", "r100916_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A young man is standing in front of a glacier with mountains in the background.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"}]}, {"id": "real_r004061", "images": ["r004061_00.jpg", "r004061_01.jpg", "r004061_02.jpg", "r004061_03.jpg", "r004061_04.jpg", "r004061_05.jpg", "r004061_06.jpg", "r004061_07.jpg", "r004061_08.jpg", "r004061_09.jpg", 
"r004061_10.jpg", "r004061_11.jpg", "r004061_12.jpg", "r004061_13.jpg", "r004061_14.jpg", "r004061_15.jpg", "r004061_16.jpg", "r004061_17.jpg", "r004061_18.jpg", "r004061_19.jpg", "r004061_20.jpg", "r004061_21.jpg", "r004061_22.jpg", "r004061_23.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a man wearing a striped blanket over his backpack runs behind two other men.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"}]}, {"id": "worsen_gen_b404675", "images": ["b404675_00.jpg", "b404675_01.jpg", "b404675_02.jpg", "b404675_03.jpg", "b404675_04.jpg", "b404675_05.jpg", "b404675_06.jpg", "b404675_07.jpg", "b404675_08.jpg", "b404675_09.jpg", "b404675_10.jpg", "b404675_11.jpg", "b404675_12.jpg", "b404675_13.jpg", "b404675_14.jpg", "b404675_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is After a few years, Ghulam Muhammad became the owner of his business ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 2\n factual consistency: 3\n overall score: 1\n"}]}, {"id": "worsen_gen_b402727", "images": ["b402727_00.jpg", "b402727_01.jpg", "b402727_02.jpg", "b402727_03.jpg", "b402727_04.jpg", "b402727_05.jpg", "b402727_06.jpg", "b402727_07.jpg", "b402727_08.jpg", "b402727_09.jpg", "b402727_10.jpg", "b402727_11.jpg", "b402727_12.jpg", "b402727_13.jpg", "b402727_14.jpg", "b402727_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in 
judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is view down to the path in the field, moving forward, night, dark. ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 1\n"}]}, {"id": "worsen_gen_b304986", "images": ["b304986_00.jpg", "b304986_01.jpg", "b304986_02.jpg", "b304986_03.jpg", "b304986_04.jpg", "b304986_05.jpg", "b304986_06.jpg", "b304986_07.jpg", "b304986_08.jpg", "b304986_09.jpg", "b304986_10.jpg", "b304986_11.jpg", "b304986_12.jpg", "b304986_13.jpg", "b304986_14.jpg", "b304986_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is Old Philippino man, close up, subtle smile, teats of joy, in old wood room ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 3\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 2\n factual consistency: 3\n overall score: 1\n"}]}, {"id": "static_d500937", "images": ["d500937_00.jpg", "d500937_01.jpg", "d500937_02.jpg", "d500937_03.jpg", "d500937_04.jpg", "d500937_05.jpg", "d500937_06.jpg", "d500937_07.jpg", "d500937_08.jpg", "d500937_09.jpg", "d500937_10.jpg", "d500937_11.jpg", "d500937_12.jpg", "d500937_13.jpg", "d500937_14.jpg", "d500937_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, 
output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is write y in the garden with flowers ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object consistency: 3\n dynamic degree: 1\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"}]}, {"id": "static_d500506", "images": ["d500506_00.jpg", "d500506_01.jpg", "d500506_02.jpg", "d500506_03.jpg", "d500506_04.jpg", "d500506_05.jpg", "d500506_06.jpg", "d500506_07.jpg", "d500506_08.jpg", "d500506_09.jpg", "d500506_10.jpg", "d500506_11.jpg", "d500506_12.jpg", "d500506_13.jpg", "d500506_14.jpg", "d500506_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a boy paiting a beautiful landscape ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object consistency: 3\n dynamic degree: 1\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"}]}, {"id": "static_d401950", "images": ["d401950_00.jpg", "d401950_01.jpg", "d401950_02.jpg", "d401950_03.jpg", "d401950_04.jpg", "d401950_05.jpg", "d401950_06.jpg", "d401950_07.jpg", "d401950_08.jpg", "d401950_09.jpg", "d401950_10.jpg", "d401950_11.jpg", "d401950_12.jpg", "d401950_13.jpg", "d401950_14.jpg", "d401950_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is sneaker with exclusive design worth 1 billion ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> 
<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object consistency: 3\n dynamic degree: 1\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"}]}, {"id": "insf_a500251", "images": ["a500251_00.jpg", "a500251_01.jpg", "a500251_02.jpg", "a500251_03.jpg", "a500251_04.jpg", "a500251_05.jpg", "a500251_06.jpg", "a500251_07.jpg", "a500251_08.jpg", "a500251_09.jpg", "a500251_10.jpg", "a500251_11.jpg", "a500251_12.jpg", "a500251_13.jpg", "a500251_14.jpg", "a500251_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is :sci fi war in the moon ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"}]}, {"id": "insf_a400480", "images": ["a400480_00.jpg", "a400480_01.jpg", "a400480_02.jpg", "a400480_03.jpg", "a400480_04.jpg", "a400480_05.jpg", "a400480_06.jpg", "a400480_07.jpg", "a400480_08.jpg", "a400480_09.jpg", "a400480_10.jpg", "a400480_11.jpg", "a400480_12.jpg", "a400480_13.jpg", "a400480_14.jpg", "a400480_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is for a dog accessories brand a welcome image: brand name: hot doggies ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"}]}, {"id": "insf_a500010", "images": ["a500010_00.jpg", "a500010_01.jpg", "a500010_02.jpg", "a500010_03.jpg", "a500010_04.jpg", "a500010_05.jpg", "a500010_06.jpg", "a500010_07.jpg", 
"a500010_08.jpg", "a500010_09.jpg", "a500010_10.jpg", "a500010_11.jpg", "a500010_12.jpg", "a500010_13.jpg", "a500010_14.jpg", "a500010_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a druid in the forest cinematic ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"}]}, {"id": "lab_3005033", "images": ["3005033_00.jpg", "3005033_01.jpg", "3005033_02.jpg", "3005033_03.jpg", "3005033_04.jpg", "3005033_05.jpg", "3005033_06.jpg", "3005033_07.jpg", "3005033_08.jpg", "3005033_09.jpg", "3005033_10.jpg", "3005033_11.jpg", "3005033_12.jpg", "3005033_13.jpg", "3005033_14.jpg", "3005033_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a robot vacuum traveling through southeast asian countries ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 3\n object consistency: 2\n dynamic degree: 3\n motion smoothness: 2\n text-to-video alignment: 2\n factual consistency: 3\n overall score: 2\n"}]}, {"id": "lab_7004180", "images": ["7004180_00.jpg", "7004180_01.jpg", "7004180_02.jpg", "7004180_03.jpg", "7004180_04.jpg", "7004180_05.jpg", "7004180_06.jpg", "7004180_07.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor 
each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is jiaxuan is the most beautiful \uff0celegant\uff0ccharming girl in the world. billions of boys fell in love with her ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"}]}, {"id": "lab_1006309", "images": ["1006309_00.jpg", "1006309_01.jpg", "1006309_02.jpg", "1006309_03.jpg", "1006309_04.jpg", "1006309_05.jpg", "1006309_06.jpg", "1006309_07.jpg", "1006309_08.jpg", "1006309_09.jpg", "1006309_10.jpg", "1006309_11.jpg", "1006309_12.jpg", "1006309_13.jpg", "1006309_14.jpg", "1006309_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is scary halloween skeletons are having a conversation with each other ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 3\n object consistency: 2\n dynamic degree: 3\n motion smoothness: 2\n text-to-video alignment: 3\n factual consistency: 2\n overall score: 3\n"}]}]
+ [
+ {
+ "id": "real_r003679",
+ "images": [
+ "r003679_00.jpg",
+ "r003679_01.jpg",
+ "r003679_02.jpg",
+ "r003679_03.jpg",
+ "r003679_04.jpg",
+ "r003679_05.jpg",
+ "r003679_06.jpg",
+ "r003679_07.jpg",
+ "r003679_08.jpg",
+ "r003679_09.jpg",
+ "r003679_10.jpg",
+ "r003679_11.jpg",
+ "r003679_12.jpg",
+ "r003679_13.jpg",
+ "r003679_14.jpg",
+ "r003679_15.jpg",
+ "r003679_16.jpg",
+ "r003679_17.jpg",
+ "r003679_18.jpg",
+ "r003679_19.jpg",
+ "r003679_20.jpg",
+ "r003679_21.jpg",
+ "r003679_22.jpg",
+ "r003679_23.jpg"
+ ],
+ "conversations": [
+ {
+ "from": "human",
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is dog rolls over and falls off couch,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+ },
+ {
+ "from": "gpt",
+ "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+ }
+ ]
+ },
+ {
+ "id": "real_r100916",
+ "images": [
+ "r100916_00.jpg",
+ "r100916_01.jpg",
+ "r100916_02.jpg",
+ "r100916_03.jpg",
+ "r100916_04.jpg",
+ "r100916_05.jpg",
+ "r100916_06.jpg",
+ "r100916_07.jpg",
+ "r100916_08.jpg",
+ "r100916_09.jpg",
+ "r100916_10.jpg",
+ "r100916_11.jpg",
+ "r100916_12.jpg",
+ "r100916_13.jpg",
+ "r100916_14.jpg",
+ "r100916_15.jpg"
+ ],
+ "conversations": [
+ {
+ "from": "human",
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A young man is standing in front of a glacier with mountains in the background.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+ },
+ {
+ "from": "gpt",
+ "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+ }
+ ]
+ },
+ {
+ "id": "real_r004061",
+ "images": [
+ "r004061_00.jpg",
+ "r004061_01.jpg",
+ "r004061_02.jpg",
+ "r004061_03.jpg",
+ "r004061_04.jpg",
+ "r004061_05.jpg",
+ "r004061_06.jpg",
+ "r004061_07.jpg",
+ "r004061_08.jpg",
+ "r004061_09.jpg",
+ "r004061_10.jpg",
+ "r004061_11.jpg",
+ "r004061_12.jpg",
+ "r004061_13.jpg",
+ "r004061_14.jpg",
+ "r004061_15.jpg",
+ "r004061_16.jpg",
+ "r004061_17.jpg",
+ "r004061_18.jpg",
+ "r004061_19.jpg",
+ "r004061_20.jpg",
+ "r004061_21.jpg",
+ "r004061_22.jpg",
+ "r004061_23.jpg"
+ ],
+ "conversations": [
+ {
+ "from": "human",
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a man wearing a striped blanket over his backpack runs behind two other men.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+ },
+ {
+ "from": "gpt",
+ "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+ }
+ ]
+ },
+ {
+ "id": "worsen_gen_b404675",
+ "images": [
+ "b404675_00.jpg",
+ "b404675_01.jpg",
+ "b404675_02.jpg",
+ "b404675_03.jpg",
+ "b404675_04.jpg",
+ "b404675_05.jpg",
+ "b404675_06.jpg",
+ "b404675_07.jpg",
+ "b404675_08.jpg",
+ "b404675_09.jpg",
+ "b404675_10.jpg",
+ "b404675_11.jpg",
+ "b404675_12.jpg",
+ "b404675_13.jpg",
+ "b404675_14.jpg",
+ "b404675_15.jpg"
+ ],
+ "conversations": [
+ {
+ "from": "human",
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is After a few years, Ghulam Muhammad became the owner of his business ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+ },
+ {
+ "from": "gpt",
+ "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 2\n factual consistency: 3\n overall score: 1\n"
+ }
+ ]
+ },
+ {
+ "id": "worsen_gen_b402727",
+ "images": [
+ "b402727_00.jpg",
+ "b402727_01.jpg",
+ "b402727_02.jpg",
+ "b402727_03.jpg",
+ "b402727_04.jpg",
+ "b402727_05.jpg",
+ "b402727_06.jpg",
+ "b402727_07.jpg",
+ "b402727_08.jpg",
+ "b402727_09.jpg",
+ "b402727_10.jpg",
+ "b402727_11.jpg",
+ "b402727_12.jpg",
+ "b402727_13.jpg",
+ "b402727_14.jpg",
+ "b402727_15.jpg"
+ ],
+ "conversations": [
+ {
+ "from": "human",
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is view down to the path in the field, moving forward, night, dark. ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+ },
+ {
+ "from": "gpt",
+ "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 1\n"
+ }
+ ]
+ },
+ {
+ "id": "worsen_gen_b304986",
+ "images": [
+ "b304986_00.jpg",
+ "b304986_01.jpg",
+ "b304986_02.jpg",
+ "b304986_03.jpg",
+ "b304986_04.jpg",
+ "b304986_05.jpg",
+ "b304986_06.jpg",
+ "b304986_07.jpg",
+ "b304986_08.jpg",
+ "b304986_09.jpg",
+ "b304986_10.jpg",
+ "b304986_11.jpg",
+ "b304986_12.jpg",
+ "b304986_13.jpg",
+ "b304986_14.jpg",
+ "b304986_15.jpg"
+ ],
+ "conversations": [
+ {
+ "from": "human",
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is Old Philippino man, close up, subtle smile, teats of joy, in old wood room ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+ },
+ {
+ "from": "gpt",
+ "value": "visual quality: 3\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 2\n factual consistency: 3\n overall score: 1\n"
+ }
+ ]
+ },
+ {
+ "id": "static_d500937",
+ "images": [
+ "d500937_00.jpg",
+ "d500937_01.jpg",
+ "d500937_02.jpg",
+ "d500937_03.jpg",
+ "d500937_04.jpg",
+ "d500937_05.jpg",
+ "d500937_06.jpg",
+ "d500937_07.jpg",
+ "d500937_08.jpg",
+ "d500937_09.jpg",
+ "d500937_10.jpg",
+ "d500937_11.jpg",
+ "d500937_12.jpg",
+ "d500937_13.jpg",
+ "d500937_14.jpg",
+ "d500937_15.jpg"
+ ],
+ "conversations": [
+ {
+ "from": "human",
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is write y in the garden with flowers ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
228
+ },
229
+ {
230
+ "from": "gpt",
231
+ "value": "visual quality: 1\n object consistency: 3\n dynamic degree: 1\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"
232
+ }
233
+ ]
234
+ },
235
+ {
236
+ "id": "static_d500506",
237
+ "images": [
238
+ "d500506_00.jpg",
239
+ "d500506_01.jpg",
240
+ "d500506_02.jpg",
241
+ "d500506_03.jpg",
242
+ "d500506_04.jpg",
243
+ "d500506_05.jpg",
244
+ "d500506_06.jpg",
245
+ "d500506_07.jpg",
246
+ "d500506_08.jpg",
247
+ "d500506_09.jpg",
248
+ "d500506_10.jpg",
249
+ "d500506_11.jpg",
250
+ "d500506_12.jpg",
251
+ "d500506_13.jpg",
252
+ "d500506_14.jpg",
253
+ "d500506_15.jpg"
254
+ ],
255
+ "conversations": [
256
+ {
257
+ "from": "human",
258
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a boy paiting a beautiful landscape ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
259
+ },
260
+ {
261
+ "from": "gpt",
262
+ "value": "visual quality: 1\n object consistency: 3\n dynamic degree: 1\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"
263
+ }
264
+ ]
265
+ },
266
+ {
267
+ "id": "static_d401950",
268
+ "images": [
269
+ "d401950_00.jpg",
270
+ "d401950_01.jpg",
271
+ "d401950_02.jpg",
272
+ "d401950_03.jpg",
273
+ "d401950_04.jpg",
274
+ "d401950_05.jpg",
275
+ "d401950_06.jpg",
276
+ "d401950_07.jpg",
277
+ "d401950_08.jpg",
278
+ "d401950_09.jpg",
279
+ "d401950_10.jpg",
280
+ "d401950_11.jpg",
281
+ "d401950_12.jpg",
282
+ "d401950_13.jpg",
283
+ "d401950_14.jpg",
284
+ "d401950_15.jpg"
285
+ ],
286
+ "conversations": [
287
+ {
288
+ "from": "human",
289
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is sneaker with exclusive design worth 1 billion ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
290
+ },
291
+ {
292
+ "from": "gpt",
293
+ "value": "visual quality: 1\n object consistency: 3\n dynamic degree: 1\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"
294
+ }
295
+ ]
296
+ },
297
+ {
298
+ "id": "insf_a500251",
299
+ "images": [
300
+ "a500251_00.jpg",
301
+ "a500251_01.jpg",
302
+ "a500251_02.jpg",
303
+ "a500251_03.jpg",
304
+ "a500251_04.jpg",
305
+ "a500251_05.jpg",
306
+ "a500251_06.jpg",
307
+ "a500251_07.jpg",
308
+ "a500251_08.jpg",
309
+ "a500251_09.jpg",
310
+ "a500251_10.jpg",
311
+ "a500251_11.jpg",
312
+ "a500251_12.jpg",
313
+ "a500251_13.jpg",
314
+ "a500251_14.jpg",
315
+ "a500251_15.jpg"
316
+ ],
317
+ "conversations": [
318
+ {
319
+ "from": "human",
320
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is :sci fi war in the moon ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
321
+ },
322
+ {
323
+ "from": "gpt",
324
+ "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"
325
+ }
326
+ ]
327
+ },
328
+ {
329
+ "id": "insf_a400480",
330
+ "images": [
331
+ "a400480_00.jpg",
332
+ "a400480_01.jpg",
333
+ "a400480_02.jpg",
334
+ "a400480_03.jpg",
335
+ "a400480_04.jpg",
336
+ "a400480_05.jpg",
337
+ "a400480_06.jpg",
338
+ "a400480_07.jpg",
339
+ "a400480_08.jpg",
340
+ "a400480_09.jpg",
341
+ "a400480_10.jpg",
342
+ "a400480_11.jpg",
343
+ "a400480_12.jpg",
344
+ "a400480_13.jpg",
345
+ "a400480_14.jpg",
346
+ "a400480_15.jpg"
347
+ ],
348
+ "conversations": [
349
+ {
350
+ "from": "human",
351
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is for a dog accessories brand a welcome image: brand name: hot doggies ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
352
+ },
353
+ {
354
+ "from": "gpt",
355
+ "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"
356
+ }
357
+ ]
358
+ },
359
+ {
360
+ "id": "insf_a500010",
361
+ "images": [
362
+ "a500010_00.jpg",
363
+ "a500010_01.jpg",
364
+ "a500010_02.jpg",
365
+ "a500010_03.jpg",
366
+ "a500010_04.jpg",
367
+ "a500010_05.jpg",
368
+ "a500010_06.jpg",
369
+ "a500010_07.jpg",
370
+ "a500010_08.jpg",
371
+ "a500010_09.jpg",
372
+ "a500010_10.jpg",
373
+ "a500010_11.jpg",
374
+ "a500010_12.jpg",
375
+ "a500010_13.jpg",
376
+ "a500010_14.jpg",
377
+ "a500010_15.jpg"
378
+ ],
379
+ "conversations": [
380
+ {
381
+ "from": "human",
382
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a druid in the forest cinematic ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
383
+ },
384
+ {
385
+ "from": "gpt",
386
+ "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"
387
+ }
388
+ ]
389
+ },
390
+ {
391
+ "id": "lab_3005033",
392
+ "images": [
393
+ "3005033_00.jpg",
394
+ "3005033_01.jpg",
395
+ "3005033_02.jpg",
396
+ "3005033_03.jpg",
397
+ "3005033_04.jpg",
398
+ "3005033_05.jpg",
399
+ "3005033_06.jpg",
400
+ "3005033_07.jpg",
401
+ "3005033_08.jpg",
402
+ "3005033_09.jpg",
403
+ "3005033_10.jpg",
404
+ "3005033_11.jpg",
405
+ "3005033_12.jpg",
406
+ "3005033_13.jpg",
407
+ "3005033_14.jpg",
408
+ "3005033_15.jpg"
409
+ ],
410
+ "conversations": [
411
+ {
412
+ "from": "human",
413
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a robot vacuum traveling through southeast asian countries ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
414
+ },
415
+ {
416
+ "from": "gpt",
417
+ "value": "visual quality: 3\n object consistency: 2\n dynamic degree: 3\n motion smoothness: 2\n text-to-video alignment: 2\n factual consistency: 3\n overall score: 2\n"
418
+ }
419
+ ]
420
+ },
421
+ {
422
+ "id": "lab_7004180",
423
+ "images": [
424
+ "7004180_00.jpg",
425
+ "7004180_01.jpg",
426
+ "7004180_02.jpg",
427
+ "7004180_03.jpg",
428
+ "7004180_04.jpg",
429
+ "7004180_05.jpg",
430
+ "7004180_06.jpg",
431
+ "7004180_07.jpg"
432
+ ],
433
+ "conversations": [
434
+ {
435
+ "from": "human",
436
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is jiaxuan is the most beautiful \uff0celegant\uff0ccharming girl in the world. billions of boys fell in love with her ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> \n"
437
+ },
438
+ {
439
+ "from": "gpt",
440
+ "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
441
+ }
442
+ ]
443
+ },
444
+ {
445
+ "id": "lab_1006309",
446
+ "images": [
447
+ "1006309_00.jpg",
448
+ "1006309_01.jpg",
449
+ "1006309_02.jpg",
450
+ "1006309_03.jpg",
451
+ "1006309_04.jpg",
452
+ "1006309_05.jpg",
453
+ "1006309_06.jpg",
454
+ "1006309_07.jpg",
455
+ "1006309_08.jpg",
456
+ "1006309_09.jpg",
457
+ "1006309_10.jpg",
458
+ "1006309_11.jpg",
459
+ "1006309_12.jpg",
460
+ "1006309_13.jpg",
461
+ "1006309_14.jpg",
462
+ "1006309_15.jpg"
463
+ ],
464
+ "conversations": [
465
+ {
466
+ "from": "human",
467
+ "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is scary halloween skeletons are having a conversation with each other ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
468
+ },
469
+ {
470
+ "from": "gpt",
471
+ "value": "visual quality: 3\n object consistency: 2\n dynamic degree: 3\n motion smoothness: 2\n text-to-video alignment: 3\n factual consistency: 2\n overall score: 3\n"
472
+ }
473
+ ]
474
+ }
475
+ ]
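
Each record added to all_subsets.json above follows the same layout: an "id", a list of frame filenames under "images", and a two-turn "conversations" list whose "gpt" value encodes the seven dimension scores as plain text. As a minimal sketch (not code from this commit; the helper name parse_scores is an assumption), such a score string could be parsed back into a dict like this:

import re

def parse_scores(score_text: str) -> dict:
    # Turn a block of "dimension: score" lines, like the "gpt" values above, into a dict of ints.
    scores = {}
    for line in score_text.strip().splitlines():
        match = re.match(r"\s*(.+?):\s*([123])\s*$", line)
        if match:
            scores[match.group(1).strip()] = int(match.group(2))
    return scores

# Illustrative usage with the format used in the records above:
print(parse_scores("visual quality: 3\n object consistency: 2\n overall score: 1\n"))
# -> {'visual quality': 3, 'object consistency': 2, 'overall score': 1}
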
examples/b304986.mp4 ADDED
Binary file (53.6 kB). View file
 
examples/b402727.mp4 ADDED
Binary file (43.7 kB). View file
 
examples/b404675.mp4 ADDED
Binary file (218 kB). View file
 
examples/d401950.mp4 ADDED
Binary file (13 kB). View file
 
examples/d500506.mp4 ADDED
Binary file (53.2 kB). View file
 
examples/d500937.mp4 ADDED
Binary file (38.6 kB). View file
 
examples/data_subset.json DELETED
@@ -1,368 +0,0 @@
1
- [
2
- {
3
- "id": "r003679",
4
- "images": [
5
- "r003679_00.jpg",
6
- "r003679_01.jpg",
7
- "r003679_02.jpg",
8
- "r003679_03.jpg",
9
- "r003679_04.jpg",
10
- "r003679_05.jpg",
11
- "r003679_06.jpg",
12
- "r003679_07.jpg",
13
- "r003679_08.jpg",
14
- "r003679_09.jpg",
15
- "r003679_10.jpg",
16
- "r003679_11.jpg",
17
- "r003679_12.jpg",
18
- "r003679_13.jpg",
19
- "r003679_14.jpg",
20
- "r003679_15.jpg",
21
- "r003679_16.jpg",
22
- "r003679_17.jpg",
23
- "r003679_18.jpg",
24
- "r003679_19.jpg",
25
- "r003679_20.jpg",
26
- "r003679_21.jpg",
27
- "r003679_22.jpg",
28
- "r003679_23.jpg"
29
- ],
30
- "conversations": [
31
- {
32
- "from": "human",
33
- "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is dog rolls over and falls off couch,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
34
- },
35
- {
36
- "from": "gpt",
37
- "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
38
- }
39
- ]
40
- },
41
- {
42
- "id": "r100916",
43
- "images": [
44
- "r100916_00.jpg",
45
- "r100916_01.jpg",
46
- "r100916_02.jpg",
47
- "r100916_03.jpg",
48
- "r100916_04.jpg",
49
- "r100916_05.jpg",
50
- "r100916_06.jpg",
51
- "r100916_07.jpg",
52
- "r100916_08.jpg",
53
- "r100916_09.jpg",
54
- "r100916_10.jpg",
55
- "r100916_11.jpg",
56
- "r100916_12.jpg",
57
- "r100916_13.jpg",
58
- "r100916_14.jpg",
59
- "r100916_15.jpg"
60
- ],
61
- "conversations": [
62
- {
63
- "from": "human",
64
- "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A young man is standing in front of a glacier with mountains in the background.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
65
- },
66
- {
67
- "from": "gpt",
68
- "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
69
- }
70
- ]
71
- },
72
- {
73
- "id": "r004061",
74
- "images": [
75
- "r004061_00.jpg",
76
- "r004061_01.jpg",
77
- "r004061_02.jpg",
78
- "r004061_03.jpg",
79
- "r004061_04.jpg",
80
- "r004061_05.jpg",
81
- "r004061_06.jpg",
82
- "r004061_07.jpg",
83
- "r004061_08.jpg",
84
- "r004061_09.jpg",
85
- "r004061_10.jpg",
86
- "r004061_11.jpg",
87
- "r004061_12.jpg",
88
- "r004061_13.jpg",
89
- "r004061_14.jpg",
90
- "r004061_15.jpg",
91
- "r004061_16.jpg",
92
- "r004061_17.jpg",
93
- "r004061_18.jpg",
94
- "r004061_19.jpg",
95
- "r004061_20.jpg",
96
- "r004061_21.jpg",
97
- "r004061_22.jpg",
98
- "r004061_23.jpg"
99
- ],
100
- "conversations": [
101
- {
102
- "from": "human",
103
- "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a man wearing a striped blanket over his backpack runs behind two other men.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
104
- },
105
- {
106
- "from": "gpt",
107
- "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
108
- }
109
- ]
110
- },
111
- {
112
- "id": "r000228",
113
- "images": [
114
- "r000228_00.jpg",
115
- "r000228_01.jpg",
116
- "r000228_02.jpg",
117
- "r000228_03.jpg",
118
- "r000228_04.jpg",
119
- "r000228_05.jpg",
120
- "r000228_06.jpg",
121
- "r000228_07.jpg",
122
- "r000228_08.jpg",
123
- "r000228_09.jpg",
124
- "r000228_10.jpg",
125
- "r000228_11.jpg",
126
- "r000228_12.jpg",
127
- "r000228_13.jpg",
128
- "r000228_14.jpg",
129
- "r000228_15.jpg",
130
- "r000228_16.jpg",
131
- "r000228_17.jpg",
132
- "r000228_18.jpg",
133
- "r000228_19.jpg",
134
- "r000228_20.jpg",
135
- "r000228_21.jpg",
136
- "r000228_22.jpg",
137
- "r000228_23.jpg"
138
- ],
139
- "conversations": [
140
- {
141
- "from": "human",
142
- "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is two people get into sports car,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
143
- },
144
- {
145
- "from": "gpt",
146
- "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
147
- }
148
- ]
149
- },
150
- {
151
- "id": "r100887",
152
- "images": [
153
- "r100887_00.jpg",
154
- "r100887_01.jpg",
155
- "r100887_02.jpg",
156
- "r100887_03.jpg",
157
- "r100887_04.jpg",
158
- "r100887_05.jpg",
159
- "r100887_06.jpg",
160
- "r100887_07.jpg",
161
- "r100887_08.jpg",
162
- "r100887_09.jpg",
163
- "r100887_10.jpg",
164
- "r100887_11.jpg",
165
- "r100887_12.jpg",
166
- "r100887_13.jpg",
167
- "r100887_14.jpg",
168
- "r100887_15.jpg"
169
- ],
170
- "conversations": [
171
- {
172
- "from": "human",
173
- "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A person using a spatula to spread something on a pan.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
174
- },
175
- {
176
- "from": "gpt",
177
- "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
178
- }
179
- ]
180
- },
181
- {
182
- "id": "r104340",
183
- "images": [
184
- "r104340_00.jpg",
185
- "r104340_01.jpg",
186
- "r104340_02.jpg",
187
- "r104340_03.jpg",
188
- "r104340_04.jpg",
189
- "r104340_05.jpg",
190
- "r104340_06.jpg",
191
- "r104340_07.jpg",
192
- "r104340_08.jpg",
193
- "r104340_09.jpg",
194
- "r104340_10.jpg",
195
- "r104340_11.jpg",
196
- "r104340_12.jpg",
197
- "r104340_13.jpg",
198
- "r104340_14.jpg",
199
- "r104340_15.jpg"
200
- ],
201
- "conversations": [
202
- {
203
- "from": "human",
204
- "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A bartender is making an alcoholic drink by pouring a clear liquid into a glass with ice cubes.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
205
- },
206
- {
207
- "from": "gpt",
208
- "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
209
- }
210
- ]
211
- },
212
- {
213
- "id": "r001599",
214
- "images": [
215
- "r001599_00.jpg",
216
- "r001599_01.jpg",
217
- "r001599_02.jpg",
218
- "r001599_03.jpg",
219
- "r001599_04.jpg",
220
- "r001599_05.jpg",
221
- "r001599_06.jpg",
222
- "r001599_07.jpg",
223
- "r001599_08.jpg",
224
- "r001599_09.jpg",
225
- "r001599_10.jpg",
226
- "r001599_11.jpg",
227
- "r001599_12.jpg",
228
- "r001599_13.jpg",
229
- "r001599_14.jpg",
230
- "r001599_15.jpg",
231
- "r001599_16.jpg",
232
- "r001599_17.jpg",
233
- "r001599_18.jpg",
234
- "r001599_19.jpg",
235
- "r001599_20.jpg",
236
- "r001599_21.jpg",
237
- "r001599_22.jpg",
238
- "r001599_23.jpg"
239
- ],
240
- "conversations": [
241
- {
242
- "from": "human",
243
- "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a man uses a camera,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
244
- },
245
- {
246
- "from": "gpt",
247
- "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
248
- }
249
- ]
250
- },
251
- {
252
- "id": "r101298",
253
- "images": [
254
- "r101298_00.jpg",
255
- "r101298_01.jpg",
256
- "r101298_02.jpg",
257
- "r101298_03.jpg",
258
- "r101298_04.jpg",
259
- "r101298_05.jpg",
260
- "r101298_06.jpg",
261
- "r101298_07.jpg",
262
- "r101298_08.jpg",
263
- "r101298_09.jpg",
264
- "r101298_10.jpg",
265
- "r101298_11.jpg",
266
- "r101298_12.jpg",
267
- "r101298_13.jpg",
268
- "r101298_14.jpg",
269
- "r101298_15.jpg",
270
- "r101298_16.jpg",
271
- "r101298_17.jpg",
272
- "r101298_18.jpg",
273
- "r101298_19.jpg",
274
- "r101298_20.jpg",
275
- "r101298_21.jpg",
276
- "r101298_22.jpg",
277
- "r101298_23.jpg"
278
- ],
279
- "conversations": [
280
- {
281
- "from": "human",
282
- "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A baseball game is being played in a stadium with a large crowd of people watching.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
283
- },
284
- {
285
- "from": "gpt",
286
- "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
287
- }
288
- ]
289
- },
290
- {
291
- "id": "r000688",
292
- "images": [
293
- "r000688_00.jpg",
294
- "r000688_01.jpg",
295
- "r000688_02.jpg",
296
- "r000688_03.jpg",
297
- "r000688_04.jpg",
298
- "r000688_05.jpg",
299
- "r000688_06.jpg",
300
- "r000688_07.jpg",
301
- "r000688_08.jpg",
302
- "r000688_09.jpg",
303
- "r000688_10.jpg",
304
- "r000688_11.jpg",
305
- "r000688_12.jpg",
306
- "r000688_13.jpg",
307
- "r000688_14.jpg",
308
- "r000688_15.jpg",
309
- "r000688_16.jpg",
310
- "r000688_17.jpg",
311
- "r000688_18.jpg",
312
- "r000688_19.jpg",
313
- "r000688_20.jpg",
314
- "r000688_21.jpg",
315
- "r000688_22.jpg",
316
- "r000688_23.jpg"
317
- ],
318
- "conversations": [
319
- {
320
- "from": "human",
321
- "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is the child furthest to the right dips her hand into the bowl shared by the other children.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
322
- },
323
- {
324
- "from": "gpt",
325
- "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
326
- }
327
- ]
328
- },
329
- {
330
- "id": "r104350",
331
- "images": [
332
- "r104350_00.jpg",
333
- "r104350_01.jpg",
334
- "r104350_02.jpg",
335
- "r104350_03.jpg",
336
- "r104350_04.jpg",
337
- "r104350_05.jpg",
338
- "r104350_06.jpg",
339
- "r104350_07.jpg",
340
- "r104350_08.jpg",
341
- "r104350_09.jpg",
342
- "r104350_10.jpg",
343
- "r104350_11.jpg",
344
- "r104350_12.jpg",
345
- "r104350_13.jpg",
346
- "r104350_14.jpg",
347
- "r104350_15.jpg",
348
- "r104350_16.jpg",
349
- "r104350_17.jpg",
350
- "r104350_18.jpg",
351
- "r104350_19.jpg",
352
- "r104350_20.jpg",
353
- "r104350_21.jpg",
354
- "r104350_22.jpg",
355
- "r104350_23.jpg"
356
- ],
357
- "conversations": [
358
- {
359
- "from": "human",
360
- "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A young man is talking to a young woman in a room with books on the shelves.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
361
- },
362
- {
363
- "from": "gpt",
364
- "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
365
- }
366
- ]
367
- }
368
- ]
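
The deleted data_subset.json above used the same record layout as all_subsets.json, with one <image> placeholder in the human turn per listed frame. A small consistency check along those lines might look like the following sketch (the function name check_frame_count is an assumption, not part of this repository):

def check_frame_count(record: dict) -> bool:
    # True if the human turn carries exactly one <image> tag per frame filename.
    human_turn = record["conversations"][0]["value"]
    return human_turn.count("<image>") == len(record["images"])
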
examples/image1.jpg DELETED
Binary file (37.3 kB)
 
examples/image10.jpg DELETED
Binary file (13.6 kB)
 
examples/image11.jpg DELETED
Binary file (7.19 kB)
 
examples/image12.jpg DELETED
Binary file (155 kB)
 
examples/image13.jpg DELETED
Binary file (59.1 kB)
 
examples/image14.jpg DELETED
Binary file (99.9 kB)
 
examples/image15.jpg DELETED
Binary file (60.7 kB)
 
examples/image2.jpg DELETED
Binary file (9.86 kB)
 
examples/image3.jpg DELETED
Binary file (40.7 kB)
 
examples/image4.jpg DELETED
Binary file (59.7 kB)
 
examples/image5.jpg DELETED
Binary file (45.2 kB)
 
examples/image6.jpg DELETED
Binary file (45.6 kB)
 
examples/image7.jpg DELETED
Binary file (41.3 kB)
 
examples/image8.jpg DELETED
Binary file (32.2 kB)
 
examples/image9.jpg DELETED
Binary file (61.1 kB)
 
examples/r000228/r000228_00.jpg DELETED
Binary file (15.5 kB)
 
examples/r000228/r000228_01.jpg DELETED
Binary file (16.2 kB)
 
examples/r000228/r000228_02.jpg DELETED
Binary file (16.2 kB)