diff --git a/.gitignore b/.gitignore
index 082c0a313cbf554b2d5be1e68f969abf3d53ac11..e0f3945d1c6a604a2e4ab4611564f460392263ed 100644
--- a/.gitignore
+++ b/.gitignore
@@ -156,4 +156,6 @@ cython_debug/
 #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
\ No newline at end of file
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+
+/.vscode
\ No newline at end of file
diff --git a/app.py b/app.py
index be8a7ad57b5dc5100affc2fda86fc1f482aa34b8..e4c6b74477ad6b67a80285e3a17e257fd8bd458a 100644
--- a/app.py
+++ b/app.py
@@ -3,13 +3,16 @@ import spaces
 import os
 import time
 import json
+import numpy as np
+import av
 from PIL import Image
 import functools
 from transformers import AutoProcessor, Idefics2ForConditionalGeneration
 from models.conversation import conv_templates
 from typing import List
-processor = AutoProcessor.from_pretrained("MFuyu/mantis-8b-idefics2-video-eval_8192_lora")
-model = Idefics2ForConditionalGeneration.from_pretrained("MFuyu/mantis-8b-idefics2-video-eval_8192_lora")
+processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-95k-mantis-2epoch_4096")
+model = Idefics2ForConditionalGeneration.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-95k-mantis-2epoch_4096")
+MAX_NUM_FRAMES = 24
 conv_template = conv_templates["idefics_2"]
 
 with open("./examples/all_subsets.json", 'r') as f:
@@ -18,8 +21,33 @@ with open("./examples/all_subsets.json", 'r') as f:
 for item in examples:
     video_id = item['images'][0].split("_")[0]
     item['images'] = [os.path.join("./examples", video_id, x) for x in item['images']]
-
-prompt = "Suppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is the beautiful girl, long hair,walk on the sity street, red cloth  ,\nall the frames of video are as follows: \n\n"
+    item['video'] = os.path.join("./examples", item['video'])
+
+VIDEO_EVAL_PROMPT = """
+Suppose you are an expert in judging and evaluating the quality of AI-generated videos, 
+please watch the following frames of a given video and see the text prompt for generating the video, 
+then give scores from 7 different dimensions:
+(1) visual quality: the quality of the video in terms of clearness, resolution, brightness, and color
+(2) object consistency, the consistency of objects or humans in video
+(3) dynamic degree, the degree of dynamic changes
+(4) motion smoothness, the smoothness of motion or movements
+(5) text-to-video alignment, the alignment between the text prompt and the video content
+(6) factual consistency, the consistency of the video content with the common-sense and factual knowledge
+(7) overall score, the overall quality of the video
+for each dimension, output a number from [1,2,3], in which '1' is 'Bad', '2' is 'Average', '3' is 'Good'.
+Here is an output example: 
+visual quality: 3
+object consistency: 2 
+dynamic degree: 2
+motion smoothness: 1
+text-to-video alignment: 1
+factual consistency: 2
+overall score: 1
+
+For this video, the text prompt is "{text_prompt}",
+all the frames of video are as follows:
+
+"""
 @spaces.GPU
 def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
     global processor, model
@@ -32,6 +60,14 @@ def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
     
     idefics_2_message = []
     cur_img_idx = 0
+    cur_vid_idx = 0
+    all_videos = [x for x in images if isinstance(x, list)]
+    flatten_images = []
+    for x in images:
+        if isinstance(x, list):
+            flatten_images.extend(x)
+        else:
+            flatten_images.append(x)
     
     print(history)
     for i, message in enumerate(history):
@@ -41,6 +77,11 @@ def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
                 "content": []
             })
             message_text = message["text"]
+            num_video_tokens_in_text = message_text.count("<video>")
+            if num_video_tokens_in_text > 0:
+                for _ in range(num_video_tokens_in_text):
+                    message_text = message_text.replace("<video>", "<image> " * len(all_videos[cur_vid_idx]), 1)
+                    cur_vid_idx += 1
             num_image_tokens_in_text = message_text.count("<image>")
             if num_image_tokens_in_text > 0:
                 sub_texts = [x.strip() for x in message_text.split("<image>")]
@@ -72,185 +113,111 @@ def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs):
     print(idefics_2_message)
     prompt = processor.apply_chat_template(idefics_2_message, add_generation_prompt=True) 
     
-    images = [Image.open(x) for x in images]
+    images = [Image.open(x) if isinstance(x, str) else x for x in flatten_images]
     inputs = processor(text=prompt, images=images, return_tensors="pt")
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
     outputs = model.generate(**inputs, max_new_tokens=1024)
     generated_text = processor.decode(outputs[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
     return generated_text
 
-def enable_next_image(uploaded_images, image):
-    uploaded_images.append(image)
-    return uploaded_images, gr.MultimodalTextbox(value=None, interactive=False)
-
-def add_message(history, message):
-    if message["files"]:
-        for file in message["files"]:
-            history.append([(file,), None])
-    if message["text"]:
-        history.append([message["text"], None])
-    return history, gr.MultimodalTextbox(value=None)
-
-def print_like_dislike(x: gr.LikeData):
-    print(x.index, x.value, x.liked)
-
-
-def get_chat_history(history):
-    chat_history = []
-    user_role = conv_template.roles[0]
-    assistant_role = conv_template.roles[1]
-    for i, message in enumerate(history):
-        if isinstance(message[0], str):
-            chat_history.append({"role": user_role, "text": message[0]})
-            if i != len(history) - 1:
-                assert message[1], "The bot message is not provided, internal error"
-                chat_history.append({"role": assistant_role, "text": message[1]})
-            else:
-                assert not message[1], "the bot message internal error, get: {}".format(message[1])
-                chat_history.append({"role": assistant_role, "text": ""})
-    return chat_history
-
-
-def get_chat_images(history):
-    images = []
-    for message in history:
-        if isinstance(message[0], tuple):
-            images.extend(message[0])
-    return images
-
-
-def bot(history):
-    cur_messages = {"text": "", "images": []}
-    for message in history[::-1]:
-        if message[1]:
-            break
-        if isinstance(message[0], str):
-            cur_messages["text"] = message[0] + " " + cur_messages["text"]
-        elif isinstance(message[0], tuple):
-            cur_messages["images"].extend(message[0])
-    cur_messages["text"] = cur_messages["text"].strip()
-    cur_messages["images"] = cur_messages["images"][::-1]
-    if not cur_messages["text"]:
-        raise gr.Error("Please enter a message")
-    if cur_messages['text'].count("<image>") < len(cur_messages['images']):
-        gr.Warning("The number of images uploaded is more than the number of <image> placeholders in the text. Will automatically prepend <image> to the text.")
-        cur_messages['text'] += "<image> "* (len(cur_messages['images']) - cur_messages['text'].count("<image>"))
-        history[-1][0] = cur_messages["text"]
-    if cur_messages['text'].count("<image>") > len(cur_messages['images']):
-        gr.Warning("The number of images uploaded is less than the number of <image> placeholders in the text. Will automatically remove extra <image> placeholders from the text.")
-        cur_messages['text'] = cur_messages['text'][::-1].replace("<image>"[::-1], "", cur_messages['text'].count("<image>") - len(cur_messages['images']))[::-1]
-        history[-1][0] = cur_messages["text"]
-        
     
+def read_video_pyav(container, indices):
+    '''
+    Decode the frames at the requested positions from a video with the PyAV decoder.
+
+    Args:
+        container (av.container.input.InputContainer): PyAV container.
+        indices (List[int]): Frame indices to decode; non-empty, assumed sorted ascending (first/last bound the scan).
+
+    Returns:
+        np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
+    '''
+    frames = []
+    container.seek(0)  # seek back to the start before decoding
+    start_index = indices[0]
+    end_index = indices[-1]
+    for i, frame in enumerate(container.decode(video=0)):
+        if i > end_index:
+            break  # past the last requested frame; stop decoding early
+        if i >= start_index and i in indices:
+            frames.append(frame)
+    return np.stack([x.to_ndarray(format="rgb24") for x in frames])  # np.stack raises ValueError if no frame matched
+
+def eval_video(prompt, video:str):
+    """Sample up to MAX_NUM_FRAMES evenly spaced frames from the file at `video` and have the model score it against `prompt`."""
+    with av.open(video) as container:  # context manager closes the file even if decoding fails
+        total_frames = container.streams.video[0].frames  # PyAV reports 0 when the count is unknown
+        if total_frames > MAX_NUM_FRAMES:
+            indices = np.linspace(0, total_frames, MAX_NUM_FRAMES, endpoint=False).astype(int)  # exact count; a float-step arange can overshoot by one
+        else:
+            indices = np.arange(total_frames)
+        if indices.size == 0:
+            raise gr.Error("Could not read any frames from the uploaded video.")
+        video_frames = read_video_pyav(container, indices)
+    frames = [Image.fromarray(x) for x in video_frames]
     
-    chat_history = get_chat_history(history)
-    chat_images = get_chat_images(history)
-    
-    generation_kwargs = {
-        "max_new_tokens": 4096,
-        "num_beams": 1,
-        "do_sample": False
-    }
-    
-    response = generate(None, chat_images, chat_history, **generation_kwargs) 
-    history[-1][1] = response
-    return history
-    # for _output in response:
-    #     history[-1][1] = _output
-    #     time.sleep(0.05)
-    #     yield history
+    eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=prompt)
+    eval_prompt += "<video>"
+    user_role = conv_template.roles[0]
+    assistant_role = conv_template.roles[1]
+    chat_messages = [
+        {
+            "role": user_role,
+            "text": eval_prompt
+        },
+        {
+            "role": assistant_role,
+            "text": ""
+        }
+    ]
+    response = generate(None, [frames], chat_messages)
+    return response
 
-def get_images(video_folder:str):
-    """
-    video folder contains images files like {video_folder_name}_00.jpg, {video_folder_name}_01.jpg, ...
-    """
-    images = []
-    for file in os.listdir(video_folder):
-        if file.endswith(".jpg"):
-            images.append(Image.open(os.path.join(video_folder, file)))
-    # sort images by name
-    images.sort(key=lambda x: int(x.filename.split("_")[-1].split(".")[0]))
-    return images
-        
 def build_demo():
     with gr.Blocks() as demo:
-        
-        gr.Markdown(""" # Mantis
-Mantis is a multimodal conversational AI model that can chat with users about images and text. It's optimized for multi-image reasoning, where inverleaved text and images can be used to generate responses.
-
-### [Paper](https://arxiv.org/abs/2405.01483) | [Github](https://github.com/TIGER-AI-Lab/Mantis) | [Models](https://huggingface.co/collections/TIGER-Lab/mantis-6619b0834594c878cdb1d6e4) | [Dataset](https://huggingface.co/datasets/TIGER-Lab/Mantis-Instruct) | [Website](https://tiger-ai-lab.github.io/Mantis/)            
-        """)
-        
-        gr.Markdown("""## Chat with Mantis
-        Mantis supports interleaved text-image input format, where you can simply use the placeholder `<image>` to indicate the position of uploaded images.
-        The model is optimized for multi-image reasoning, while preserving the ability to chat about text and images in a single conversation.
+        gr.Markdown("""
+            ## Video Evaluation
+            upload a video along with a text prompt when generating the video, this model will evaluate the video's quality from 7 different dimensions.
         """)
-        
-        chatbot = gr.Chatbot(line_breaks=True)
-        chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload images. Please use <image> to indicate the position of uploaded images", show_label=True)
-        
-        chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
-        
-        """
-        with gr.Accordion(label='Advanced options', open=False):
-            temperature = gr.Slider(
-                label='Temperature',
-                minimum=0.1,
-                maximum=2.0,
-                step=0.1,
-                value=0.2,
-                interactive=True
-            )
-            top_p = gr.Slider(
-                label='Top-p',
-                minimum=0.05,
-                maximum=1.0,
-                step=0.05,
-                value=1.0,
-                interactive=True
-            )
-        """
-
-        bot_msg = chat_msg.success(bot, chatbot, chatbot, api_name="bot_response")
-        
-        chatbot.like(print_like_dislike, None, None)
-
         with gr.Row():
-            send_button = gr.Button("Send")
-            clear_button = gr.ClearButton([chatbot, chat_input])
-
-        send_button.click(
-            add_message, [chatbot, chat_input], [chatbot, chat_input]
-        ).then(
-            bot, chatbot, chatbot, api_name="bot_response"
+            video = gr.Video(width=500, label="Video")
+            with gr.Column():
+                video_prompt = gr.Textbox(label="Text Prompt", interactive=True, lines=1)  # editable: users must supply the prompt for an uploaded video
+                with gr.Row():
+                    eval_button = gr.Button("Evaluate Video")
+                    clear_button = gr.ClearButton([video])
+                eval_result = gr.Textbox(label="Evaluation result", interactive=False, lines=7)  # read-only output
+        
+        eval_button.click(
+            eval_video, [video_prompt, video], [eval_result]
         )
-        dummy_id = gr.Textbox("dummy_id", label="dummy_id", visible=False)
-        dummy_output = gr.Textbox("dummy_output", label="dummy_output", visible=False)
+        
+        dummy_id = gr.Textbox("id", label="id", visible=False, min_width=50)
+        dummy_output = gr.Textbox("reference score", label="reference scores", visible=False, lines=7)
         
         gr.Examples(
-            examples=[
+            examples=
+            [
                 [
                     item['id'], 
-                    {
-                        "text": item['conversations'][0]['value'],
-                        "files": item['images']
-                    },
+                    item['prompt'],
+                    item['video'],
                     item['conversations'][1]['value']
                 ] for item in examples
             ],
-            inputs=[dummy_id, chat_input, dummy_output],
+            inputs=[dummy_id, video_prompt, video, dummy_output],
         )        
         
-        gr.Markdown("""
-## Citation
-```
-@article{jiang2024mantis,
-  title={MANTIS: Interleaved Multi-Image Instruction Tuning},
-  author={Jiang, Dongfu and He, Xuan and Zeng, Huaye and Wei, Con and Ku, Max and Liu, Qian and Chen, Wenhu},
-  journal={arXiv preprint arXiv:2405.01483},
-  year={2024}
-}
-```""")
+#         gr.Markdown("""
+# ## Citation
+# ```
+# @article{jiang2024mantis,
+#   title={MANTIS: Interleaved Multi-Image Instruction Tuning},
+#   author={Jiang, Dongfu and He, Xuan and Zeng, Huaye and Wei, Con and Ku, Max and Liu, Qian and Chen, Wenhu},
+#   journal={arXiv preprint arXiv:2405.01483},
+#   year={2024}
+# }
+# ```""")
     return demo    
     
 
diff --git a/examples/1006309.mp4 b/examples/1006309.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..f0307596de499ac6772a88869d3dc2fafaaf8aa7
Binary files /dev/null and b/examples/1006309.mp4 differ
diff --git a/examples/2006733/2006733_00.jpg b/examples/2006733/2006733_00.jpg
deleted file mode 100644
index af706a4543e45e5f2ee03db8b9bd44e077c47b3d..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_00.jpg and /dev/null differ
diff --git a/examples/2006733/2006733_01.jpg b/examples/2006733/2006733_01.jpg
deleted file mode 100644
index 548cd727befd21b5f0428c4c85555291b3359373..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_01.jpg and /dev/null differ
diff --git a/examples/2006733/2006733_02.jpg b/examples/2006733/2006733_02.jpg
deleted file mode 100644
index 8d0b397fede31b17a724d67e676cab4302a86b6c..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_02.jpg and /dev/null differ
diff --git a/examples/2006733/2006733_03.jpg b/examples/2006733/2006733_03.jpg
deleted file mode 100644
index 4ed5ce0bb2f4206706d442f329828d3dd30bd38c..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_03.jpg and /dev/null differ
diff --git a/examples/2006733/2006733_04.jpg b/examples/2006733/2006733_04.jpg
deleted file mode 100644
index 19c0f07f09de4a9417e3889e05ceb445d631e734..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_04.jpg and /dev/null differ
diff --git a/examples/2006733/2006733_05.jpg b/examples/2006733/2006733_05.jpg
deleted file mode 100644
index 9baf346971dda485f6414e020a7ab2a9f37c10b4..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_05.jpg and /dev/null differ
diff --git a/examples/2006733/2006733_06.jpg b/examples/2006733/2006733_06.jpg
deleted file mode 100644
index a07332f3097aa983c807d4eccc75ebc4a2a2d0be..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_06.jpg and /dev/null differ
diff --git a/examples/2006733/2006733_07.jpg b/examples/2006733/2006733_07.jpg
deleted file mode 100644
index e43d7d258e3eec3e3ca6682723873950f77c8dd1..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_07.jpg and /dev/null differ
diff --git a/examples/2006733/2006733_08.jpg b/examples/2006733/2006733_08.jpg
deleted file mode 100644
index 1f80a6c364acbf9bd106a9c39bfa1f2bd7f6c851..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_08.jpg and /dev/null differ
diff --git a/examples/2006733/2006733_09.jpg b/examples/2006733/2006733_09.jpg
deleted file mode 100644
index 40ce8baaaf86f4531316eac50d8cef8096943008..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_09.jpg and /dev/null differ
diff --git a/examples/2006733/2006733_10.jpg b/examples/2006733/2006733_10.jpg
deleted file mode 100644
index fc346fa38ea8f744b3fdf5ed32e9b7c9e471f8e3..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_10.jpg and /dev/null differ
diff --git a/examples/2006733/2006733_11.jpg b/examples/2006733/2006733_11.jpg
deleted file mode 100644
index 4c511b19e372093c70cb5c33ba21c06fda5fcf88..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_11.jpg and /dev/null differ
diff --git a/examples/2006733/2006733_12.jpg b/examples/2006733/2006733_12.jpg
deleted file mode 100644
index c2c2fc9a245e823e01fcd53c046f4ee7065fb4fa..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_12.jpg and /dev/null differ
diff --git a/examples/2006733/2006733_13.jpg b/examples/2006733/2006733_13.jpg
deleted file mode 100644
index 533ffaf9edc564e5a1ba3ff04f407a4d8069d778..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_13.jpg and /dev/null differ
diff --git a/examples/2006733/2006733_14.jpg b/examples/2006733/2006733_14.jpg
deleted file mode 100644
index 0dbb86ba271276b8ea9808b6344f2442805179ee..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_14.jpg and /dev/null differ
diff --git a/examples/2006733/2006733_15.jpg b/examples/2006733/2006733_15.jpg
deleted file mode 100644
index c9b89070d275aba2151c2662c0b7b87df265ec0e..0000000000000000000000000000000000000000
Binary files a/examples/2006733/2006733_15.jpg and /dev/null differ
diff --git a/examples/3005033.mp4 b/examples/3005033.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..9941405b4494789d467b7d8516f080927f2ba772
Binary files /dev/null and b/examples/3005033.mp4 differ
diff --git a/examples/7004180.mp4 b/examples/7004180.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..ba2da238eaffce4c0d139917aacaf3968025f62b
Binary files /dev/null and b/examples/7004180.mp4 differ
diff --git a/examples/a400480.mp4 b/examples/a400480.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..d55fbecf6d521aa2d3da3b2dec77807caed0db49
Binary files /dev/null and b/examples/a400480.mp4 differ
diff --git a/examples/a500010.mp4 b/examples/a500010.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..9475207ce9df6757cfbdde68b338d86116d016e9
Binary files /dev/null and b/examples/a500010.mp4 differ
diff --git a/examples/a500251.mp4 b/examples/a500251.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..fbc4a5e05ba2b58771ca8e6a41c8b15c527ebc79
Binary files /dev/null and b/examples/a500251.mp4 differ
diff --git a/examples/all_subsets.json b/examples/all_subsets.json
index 36147c800c9abe3eea9054bde51de41f98e3691d..72ef258dcd1bc84aa85d734ced9e082d86192963 100644
--- a/examples/all_subsets.json
+++ b/examples/all_subsets.json
@@ -1 +1,475 @@
-[{"id": "real_r003679", "images": ["r003679_00.jpg", "r003679_01.jpg", "r003679_02.jpg", "r003679_03.jpg", "r003679_04.jpg", "r003679_05.jpg", "r003679_06.jpg", "r003679_07.jpg", "r003679_08.jpg", "r003679_09.jpg", "r003679_10.jpg", "r003679_11.jpg", "r003679_12.jpg", "r003679_13.jpg", "r003679_14.jpg", "r003679_15.jpg", "r003679_16.jpg", "r003679_17.jpg", "r003679_18.jpg", "r003679_19.jpg", "r003679_20.jpg", "r003679_21.jpg", "r003679_22.jpg", "r003679_23.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is dog rolls over and falls off couch,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"}]}, {"id": "real_r100916", "images": ["r100916_00.jpg", "r100916_01.jpg", "r100916_02.jpg", "r100916_03.jpg", "r100916_04.jpg", "r100916_05.jpg", "r100916_06.jpg", "r100916_07.jpg", "r100916_08.jpg", "r100916_09.jpg", "r100916_10.jpg", "r100916_11.jpg", 
"r100916_12.jpg", "r100916_13.jpg", "r100916_14.jpg", "r100916_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A young man is standing in front of a glacier with mountains in the background.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"}]}, {"id": "real_r004061", "images": ["r004061_00.jpg", "r004061_01.jpg", "r004061_02.jpg", "r004061_03.jpg", "r004061_04.jpg", "r004061_05.jpg", "r004061_06.jpg", "r004061_07.jpg", "r004061_08.jpg", "r004061_09.jpg", "r004061_10.jpg", "r004061_11.jpg", "r004061_12.jpg", "r004061_13.jpg", "r004061_14.jpg", "r004061_15.jpg", "r004061_16.jpg", "r004061_17.jpg", "r004061_18.jpg", "r004061_19.jpg", "r004061_20.jpg", "r004061_21.jpg", "r004061_22.jpg", "r004061_23.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the 
text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a man wearing a striped blanket over his backpack runs behind two other men.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"}]}, {"id": "worsen_gen_b404675", "images": ["b404675_00.jpg", "b404675_01.jpg", "b404675_02.jpg", "b404675_03.jpg", "b404675_04.jpg", "b404675_05.jpg", "b404675_06.jpg", "b404675_07.jpg", "b404675_08.jpg", "b404675_09.jpg", "b404675_10.jpg", "b404675_11.jpg", "b404675_12.jpg", "b404675_13.jpg", "b404675_14.jpg", "b404675_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 
'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is After a few years, Ghulam Muhammad became the owner of his business  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 2\n factual consistency: 3\n overall score: 1\n"}]}, {"id": "worsen_gen_b402727", "images": ["b402727_00.jpg", "b402727_01.jpg", "b402727_02.jpg", "b402727_03.jpg", "b402727_04.jpg", "b402727_05.jpg", "b402727_06.jpg", "b402727_07.jpg", "b402727_08.jpg", "b402727_09.jpg", "b402727_10.jpg", "b402727_11.jpg", "b402727_12.jpg", "b402727_13.jpg", "b402727_14.jpg", "b402727_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is view down to the path in the field, moving forward, night, dark.  
,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 1\n"}]}, {"id": "worsen_gen_b304986", "images": ["b304986_00.jpg", "b304986_01.jpg", "b304986_02.jpg", "b304986_03.jpg", "b304986_04.jpg", "b304986_05.jpg", "b304986_06.jpg", "b304986_07.jpg", "b304986_08.jpg", "b304986_09.jpg", "b304986_10.jpg", "b304986_11.jpg", "b304986_12.jpg", "b304986_13.jpg", "b304986_14.jpg", "b304986_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is Old Philippino man, close up, subtle smile, teats of joy, in old wood room  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 3\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 2\n factual consistency: 3\n overall score: 1\n"}]}, {"id": "static_d500937", "images": 
["d500937_00.jpg", "d500937_01.jpg", "d500937_02.jpg", "d500937_03.jpg", "d500937_04.jpg", "d500937_05.jpg", "d500937_06.jpg", "d500937_07.jpg", "d500937_08.jpg", "d500937_09.jpg", "d500937_10.jpg", "d500937_11.jpg", "d500937_12.jpg", "d500937_13.jpg", "d500937_14.jpg", "d500937_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is write y in the garden with flowers  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object consistency: 3\n dynamic degree: 1\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"}]}, {"id": "static_d500506", "images": ["d500506_00.jpg", "d500506_01.jpg", "d500506_02.jpg", "d500506_03.jpg", "d500506_04.jpg", "d500506_05.jpg", "d500506_06.jpg", "d500506_07.jpg", "d500506_08.jpg", "d500506_09.jpg", "d500506_10.jpg", "d500506_11.jpg", "d500506_12.jpg", "d500506_13.jpg", "d500506_14.jpg", "d500506_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following 
frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a boy paiting a beautiful landscape  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object consistency: 3\n dynamic degree: 1\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"}]}, {"id": "static_d401950", "images": ["d401950_00.jpg", "d401950_01.jpg", "d401950_02.jpg", "d401950_03.jpg", "d401950_04.jpg", "d401950_05.jpg", "d401950_06.jpg", "d401950_07.jpg", "d401950_08.jpg", "d401950_09.jpg", "d401950_10.jpg", "d401950_11.jpg", "d401950_12.jpg", "d401950_13.jpg", "d401950_14.jpg", "d401950_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual 
quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is sneaker with exclusive design worth 1 billion  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object consistency: 3\n dynamic degree: 1\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"}]}, {"id": "insf_a500251", "images": ["a500251_00.jpg", "a500251_01.jpg", "a500251_02.jpg", "a500251_03.jpg", "a500251_04.jpg", "a500251_05.jpg", "a500251_06.jpg", "a500251_07.jpg", "a500251_08.jpg", "a500251_09.jpg", "a500251_10.jpg", "a500251_11.jpg", "a500251_12.jpg", "a500251_13.jpg", "a500251_14.jpg", "a500251_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is :sci fi war in the moon  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object 
consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"}]}, {"id": "insf_a400480", "images": ["a400480_00.jpg", "a400480_01.jpg", "a400480_02.jpg", "a400480_03.jpg", "a400480_04.jpg", "a400480_05.jpg", "a400480_06.jpg", "a400480_07.jpg", "a400480_08.jpg", "a400480_09.jpg", "a400480_10.jpg", "a400480_11.jpg", "a400480_12.jpg", "a400480_13.jpg", "a400480_14.jpg", "a400480_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is for a dog accessories brand a welcome image: brand name: hot doggies  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"}]}, {"id": "insf_a500010", "images": ["a500010_00.jpg", "a500010_01.jpg", "a500010_02.jpg", "a500010_03.jpg", "a500010_04.jpg", "a500010_05.jpg", "a500010_06.jpg", "a500010_07.jpg", "a500010_08.jpg", "a500010_09.jpg", "a500010_10.jpg", "a500010_11.jpg", "a500010_12.jpg", "a500010_13.jpg", 
"a500010_14.jpg", "a500010_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a druid in the forest cinematic  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"}]}, {"id": "lab_3005033", "images": ["3005033_00.jpg", "3005033_01.jpg", "3005033_02.jpg", "3005033_03.jpg", "3005033_04.jpg", "3005033_05.jpg", "3005033_06.jpg", "3005033_07.jpg", "3005033_08.jpg", "3005033_09.jpg", "3005033_10.jpg", "3005033_11.jpg", "3005033_12.jpg", "3005033_13.jpg", "3005033_14.jpg", "3005033_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual 
consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a robot vacuum traveling through southeast asian countries  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 3\n object consistency: 2\n dynamic degree: 3\n motion smoothness: 2\n text-to-video alignment: 2\n factual consistency: 3\n overall score: 2\n"}]}, {"id": "lab_7004180", "images": ["7004180_00.jpg", "7004180_01.jpg", "7004180_02.jpg", "7004180_03.jpg", "7004180_04.jpg", "7004180_05.jpg", "7004180_06.jpg", "7004180_07.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is jiaxuan is the most beautiful \uff0celegant\uff0ccharming girl in the world. 
billions of boys fell in love with her  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"}]}, {"id": "lab_1006309", "images": ["1006309_00.jpg", "1006309_01.jpg", "1006309_02.jpg", "1006309_03.jpg", "1006309_04.jpg", "1006309_05.jpg", "1006309_06.jpg", "1006309_07.jpg", "1006309_08.jpg", "1006309_09.jpg", "1006309_10.jpg", "1006309_11.jpg", "1006309_12.jpg", "1006309_13.jpg", "1006309_14.jpg", "1006309_15.jpg"], "conversations": [{"from": "human", "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is scary halloween skeletons are having a conversation with each other  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"}, {"from": "gpt", "value": "visual quality: 3\n object consistency: 2\n dynamic degree: 3\n motion smoothness: 2\n text-to-video alignment: 3\n factual consistency: 2\n overall score: 3\n"}]}]
\ No newline at end of file
+[
+    {
+        "id": "real_r003679",
+        "images": [
+            "r003679_00.jpg",
+            "r003679_01.jpg",
+            "r003679_02.jpg",
+            "r003679_03.jpg",
+            "r003679_04.jpg",
+            "r003679_05.jpg",
+            "r003679_06.jpg",
+            "r003679_07.jpg",
+            "r003679_08.jpg",
+            "r003679_09.jpg",
+            "r003679_10.jpg",
+            "r003679_11.jpg",
+            "r003679_12.jpg",
+            "r003679_13.jpg",
+            "r003679_14.jpg",
+            "r003679_15.jpg",
+            "r003679_16.jpg",
+            "r003679_17.jpg",
+            "r003679_18.jpg",
+            "r003679_19.jpg",
+            "r003679_20.jpg",
+            "r003679_21.jpg",
+            "r003679_22.jpg",
+            "r003679_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is dog rolls over and falls off couch,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "real_r100916",
+        "images": [
+            "r100916_00.jpg",
+            "r100916_01.jpg",
+            "r100916_02.jpg",
+            "r100916_03.jpg",
+            "r100916_04.jpg",
+            "r100916_05.jpg",
+            "r100916_06.jpg",
+            "r100916_07.jpg",
+            "r100916_08.jpg",
+            "r100916_09.jpg",
+            "r100916_10.jpg",
+            "r100916_11.jpg",
+            "r100916_12.jpg",
+            "r100916_13.jpg",
+            "r100916_14.jpg",
+            "r100916_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A young man is standing in front of a glacier with mountains in the background.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "real_r004061",
+        "images": [
+            "r004061_00.jpg",
+            "r004061_01.jpg",
+            "r004061_02.jpg",
+            "r004061_03.jpg",
+            "r004061_04.jpg",
+            "r004061_05.jpg",
+            "r004061_06.jpg",
+            "r004061_07.jpg",
+            "r004061_08.jpg",
+            "r004061_09.jpg",
+            "r004061_10.jpg",
+            "r004061_11.jpg",
+            "r004061_12.jpg",
+            "r004061_13.jpg",
+            "r004061_14.jpg",
+            "r004061_15.jpg",
+            "r004061_16.jpg",
+            "r004061_17.jpg",
+            "r004061_18.jpg",
+            "r004061_19.jpg",
+            "r004061_20.jpg",
+            "r004061_21.jpg",
+            "r004061_22.jpg",
+            "r004061_23.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a man wearing a striped blanket over his backpack runs behind two other men.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "worsen_gen_b404675",
+        "images": [
+            "b404675_00.jpg",
+            "b404675_01.jpg",
+            "b404675_02.jpg",
+            "b404675_03.jpg",
+            "b404675_04.jpg",
+            "b404675_05.jpg",
+            "b404675_06.jpg",
+            "b404675_07.jpg",
+            "b404675_08.jpg",
+            "b404675_09.jpg",
+            "b404675_10.jpg",
+            "b404675_11.jpg",
+            "b404675_12.jpg",
+            "b404675_13.jpg",
+            "b404675_14.jpg",
+            "b404675_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is After a few years, Ghulam Muhammad became the owner of his business  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 2\n factual consistency: 3\n overall score: 1\n"
+            }
+        ]
+    },
+    {
+        "id": "worsen_gen_b402727",
+        "images": [
+            "b402727_00.jpg",
+            "b402727_01.jpg",
+            "b402727_02.jpg",
+            "b402727_03.jpg",
+            "b402727_04.jpg",
+            "b402727_05.jpg",
+            "b402727_06.jpg",
+            "b402727_07.jpg",
+            "b402727_08.jpg",
+            "b402727_09.jpg",
+            "b402727_10.jpg",
+            "b402727_11.jpg",
+            "b402727_12.jpg",
+            "b402727_13.jpg",
+            "b402727_14.jpg",
+            "b402727_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is view down to the path in the field, moving forward, night, dark.  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 1\n"
+            }
+        ]
+    },
+    {
+        "id": "worsen_gen_b304986",
+        "images": [
+            "b304986_00.jpg",
+            "b304986_01.jpg",
+            "b304986_02.jpg",
+            "b304986_03.jpg",
+            "b304986_04.jpg",
+            "b304986_05.jpg",
+            "b304986_06.jpg",
+            "b304986_07.jpg",
+            "b304986_08.jpg",
+            "b304986_09.jpg",
+            "b304986_10.jpg",
+            "b304986_11.jpg",
+            "b304986_12.jpg",
+            "b304986_13.jpg",
+            "b304986_14.jpg",
+            "b304986_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is Old Philippino man, close up, subtle smile, teats of joy, in old wood room  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 2\n factual consistency: 3\n overall score: 1\n"
+            }
+        ]
+    },
+    {
+        "id": "static_d500937",
+        "images": [
+            "d500937_00.jpg",
+            "d500937_01.jpg",
+            "d500937_02.jpg",
+            "d500937_03.jpg",
+            "d500937_04.jpg",
+            "d500937_05.jpg",
+            "d500937_06.jpg",
+            "d500937_07.jpg",
+            "d500937_08.jpg",
+            "d500937_09.jpg",
+            "d500937_10.jpg",
+            "d500937_11.jpg",
+            "d500937_12.jpg",
+            "d500937_13.jpg",
+            "d500937_14.jpg",
+            "d500937_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is write y in the garden with flowers  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 1\n object consistency: 3\n dynamic degree: 1\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"
+            }
+        ]
+    },
+    {
+        "id": "static_d500506",
+        "images": [
+            "d500506_00.jpg",
+            "d500506_01.jpg",
+            "d500506_02.jpg",
+            "d500506_03.jpg",
+            "d500506_04.jpg",
+            "d500506_05.jpg",
+            "d500506_06.jpg",
+            "d500506_07.jpg",
+            "d500506_08.jpg",
+            "d500506_09.jpg",
+            "d500506_10.jpg",
+            "d500506_11.jpg",
+            "d500506_12.jpg",
+            "d500506_13.jpg",
+            "d500506_14.jpg",
+            "d500506_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a boy paiting a beautiful landscape  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 1\n object consistency: 3\n dynamic degree: 1\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"
+            }
+        ]
+    },
+    {
+        "id": "static_d401950",
+        "images": [
+            "d401950_00.jpg",
+            "d401950_01.jpg",
+            "d401950_02.jpg",
+            "d401950_03.jpg",
+            "d401950_04.jpg",
+            "d401950_05.jpg",
+            "d401950_06.jpg",
+            "d401950_07.jpg",
+            "d401950_08.jpg",
+            "d401950_09.jpg",
+            "d401950_10.jpg",
+            "d401950_11.jpg",
+            "d401950_12.jpg",
+            "d401950_13.jpg",
+            "d401950_14.jpg",
+            "d401950_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is sneaker with exclusive design worth 1 billion  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 1\n object consistency: 3\n dynamic degree: 1\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"
+            }
+        ]
+    },
+    {
+        "id": "insf_a500251",
+        "images": [
+            "a500251_00.jpg",
+            "a500251_01.jpg",
+            "a500251_02.jpg",
+            "a500251_03.jpg",
+            "a500251_04.jpg",
+            "a500251_05.jpg",
+            "a500251_06.jpg",
+            "a500251_07.jpg",
+            "a500251_08.jpg",
+            "a500251_09.jpg",
+            "a500251_10.jpg",
+            "a500251_11.jpg",
+            "a500251_12.jpg",
+            "a500251_13.jpg",
+            "a500251_14.jpg",
+            "a500251_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is :sci fi war in the moon  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"
+            }
+        ]
+    },
+    {
+        "id": "insf_a400480",
+        "images": [
+            "a400480_00.jpg",
+            "a400480_01.jpg",
+            "a400480_02.jpg",
+            "a400480_03.jpg",
+            "a400480_04.jpg",
+            "a400480_05.jpg",
+            "a400480_06.jpg",
+            "a400480_07.jpg",
+            "a400480_08.jpg",
+            "a400480_09.jpg",
+            "a400480_10.jpg",
+            "a400480_11.jpg",
+            "a400480_12.jpg",
+            "a400480_13.jpg",
+            "a400480_14.jpg",
+            "a400480_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is for a dog accessories brand a welcome image: brand name: hot doggies  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"
+            }
+        ]
+    },
+    {
+        "id": "insf_a500010",
+        "images": [
+            "a500010_00.jpg",
+            "a500010_01.jpg",
+            "a500010_02.jpg",
+            "a500010_03.jpg",
+            "a500010_04.jpg",
+            "a500010_05.jpg",
+            "a500010_06.jpg",
+            "a500010_07.jpg",
+            "a500010_08.jpg",
+            "a500010_09.jpg",
+            "a500010_10.jpg",
+            "a500010_11.jpg",
+            "a500010_12.jpg",
+            "a500010_13.jpg",
+            "a500010_14.jpg",
+            "a500010_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a druid in the forest cinematic  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 1\n object consistency: 1\n dynamic degree: 3\n motion smoothness: 1\n text-to-video alignment: 1\n factual consistency: 1\n overall score: 1\n"
+            }
+        ]
+    },
+    {
+        "id": "lab_3005033",
+        "images": [
+            "3005033_00.jpg",
+            "3005033_01.jpg",
+            "3005033_02.jpg",
+            "3005033_03.jpg",
+            "3005033_04.jpg",
+            "3005033_05.jpg",
+            "3005033_06.jpg",
+            "3005033_07.jpg",
+            "3005033_08.jpg",
+            "3005033_09.jpg",
+            "3005033_10.jpg",
+            "3005033_11.jpg",
+            "3005033_12.jpg",
+            "3005033_13.jpg",
+            "3005033_14.jpg",
+            "3005033_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a robot vacuum traveling through southeast asian countries  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 2\n dynamic degree: 3\n motion smoothness: 2\n text-to-video alignment: 2\n factual consistency: 3\n overall score: 2\n"
+            }
+        ]
+    },
+    {
+        "id": "lab_7004180",
+        "images": [
+            "7004180_00.jpg",
+            "7004180_01.jpg",
+            "7004180_02.jpg",
+            "7004180_03.jpg",
+            "7004180_04.jpg",
+            "7004180_05.jpg",
+            "7004180_06.jpg",
+            "7004180_07.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is jiaxuan is the most beautiful \uff0celegant\uff0ccharming girl in the world. billions of boys fell in love with her  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
+            }
+        ]
+    },
+    {
+        "id": "lab_1006309",
+        "images": [
+            "1006309_00.jpg",
+            "1006309_01.jpg",
+            "1006309_02.jpg",
+            "1006309_03.jpg",
+            "1006309_04.jpg",
+            "1006309_05.jpg",
+            "1006309_06.jpg",
+            "1006309_07.jpg",
+            "1006309_08.jpg",
+            "1006309_09.jpg",
+            "1006309_10.jpg",
+            "1006309_11.jpg",
+            "1006309_12.jpg",
+            "1006309_13.jpg",
+            "1006309_14.jpg",
+            "1006309_15.jpg"
+        ],
+        "conversations": [
+            {
+                "from": "human",
+                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is scary halloween skeletons are having a conversation with each other  ,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
+            },
+            {
+                "from": "gpt",
+                "value": "visual quality: 3\n object consistency: 2\n dynamic degree: 3\n motion smoothness: 2\n text-to-video alignment: 3\n factual consistency: 2\n overall score: 3\n"
+            }
+        ]
+    }
+]
\ No newline at end of file
diff --git a/examples/b304986.mp4 b/examples/b304986.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..a26038ace69e3b2489a5950f010396dcd273ec57
Binary files /dev/null and b/examples/b304986.mp4 differ
diff --git a/examples/b402727.mp4 b/examples/b402727.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..871471d9df827e0102c4f55e36def6eb49eea0fa
Binary files /dev/null and b/examples/b402727.mp4 differ
diff --git a/examples/b404675.mp4 b/examples/b404675.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..c968461d2f7cc056647e7468d52b6af1bc3b4246
Binary files /dev/null and b/examples/b404675.mp4 differ
diff --git a/examples/d401950.mp4 b/examples/d401950.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..1edd0c33e274cb6d22e58cda460eb9a19b93eb43
Binary files /dev/null and b/examples/d401950.mp4 differ
diff --git a/examples/d500506.mp4 b/examples/d500506.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..fcb8ca9aa5fd76855694352d530a6f9241709725
Binary files /dev/null and b/examples/d500506.mp4 differ
diff --git a/examples/d500937.mp4 b/examples/d500937.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..9254cf62e1a8a6365c01f969091ab29ed4b5adf0
Binary files /dev/null and b/examples/d500937.mp4 differ
diff --git a/examples/data_subset.json b/examples/data_subset.json
deleted file mode 100644
index ae2a3649d52bc370e88ccdbd79234a284e9d8fe8..0000000000000000000000000000000000000000
--- a/examples/data_subset.json
+++ /dev/null
@@ -1,368 +0,0 @@
-[
-    {
-        "id": "r003679",
-        "images": [
-            "r003679_00.jpg",
-            "r003679_01.jpg",
-            "r003679_02.jpg",
-            "r003679_03.jpg",
-            "r003679_04.jpg",
-            "r003679_05.jpg",
-            "r003679_06.jpg",
-            "r003679_07.jpg",
-            "r003679_08.jpg",
-            "r003679_09.jpg",
-            "r003679_10.jpg",
-            "r003679_11.jpg",
-            "r003679_12.jpg",
-            "r003679_13.jpg",
-            "r003679_14.jpg",
-            "r003679_15.jpg",
-            "r003679_16.jpg",
-            "r003679_17.jpg",
-            "r003679_18.jpg",
-            "r003679_19.jpg",
-            "r003679_20.jpg",
-            "r003679_21.jpg",
-            "r003679_22.jpg",
-            "r003679_23.jpg"
-        ],
-        "conversations": [
-            {
-                "from": "human",
-                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is dog rolls over and falls off couch,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
-            },
-            {
-                "from": "gpt",
-                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
-            }
-        ]
-    },
-    {
-        "id": "r100916",
-        "images": [
-            "r100916_00.jpg",
-            "r100916_01.jpg",
-            "r100916_02.jpg",
-            "r100916_03.jpg",
-            "r100916_04.jpg",
-            "r100916_05.jpg",
-            "r100916_06.jpg",
-            "r100916_07.jpg",
-            "r100916_08.jpg",
-            "r100916_09.jpg",
-            "r100916_10.jpg",
-            "r100916_11.jpg",
-            "r100916_12.jpg",
-            "r100916_13.jpg",
-            "r100916_14.jpg",
-            "r100916_15.jpg"
-        ],
-        "conversations": [
-            {
-                "from": "human",
-                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A young man is standing in front of a glacier with mountains in the background.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
-            },
-            {
-                "from": "gpt",
-                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
-            }
-        ]
-    },
-    {
-        "id": "r004061",
-        "images": [
-            "r004061_00.jpg",
-            "r004061_01.jpg",
-            "r004061_02.jpg",
-            "r004061_03.jpg",
-            "r004061_04.jpg",
-            "r004061_05.jpg",
-            "r004061_06.jpg",
-            "r004061_07.jpg",
-            "r004061_08.jpg",
-            "r004061_09.jpg",
-            "r004061_10.jpg",
-            "r004061_11.jpg",
-            "r004061_12.jpg",
-            "r004061_13.jpg",
-            "r004061_14.jpg",
-            "r004061_15.jpg",
-            "r004061_16.jpg",
-            "r004061_17.jpg",
-            "r004061_18.jpg",
-            "r004061_19.jpg",
-            "r004061_20.jpg",
-            "r004061_21.jpg",
-            "r004061_22.jpg",
-            "r004061_23.jpg"
-        ],
-        "conversations": [
-            {
-                "from": "human",
-                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a man wearing a striped blanket over his backpack runs behind two other men.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
-            },
-            {
-                "from": "gpt",
-                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
-            }
-        ]
-    },
-    {
-        "id": "r000228",
-        "images": [
-            "r000228_00.jpg",
-            "r000228_01.jpg",
-            "r000228_02.jpg",
-            "r000228_03.jpg",
-            "r000228_04.jpg",
-            "r000228_05.jpg",
-            "r000228_06.jpg",
-            "r000228_07.jpg",
-            "r000228_08.jpg",
-            "r000228_09.jpg",
-            "r000228_10.jpg",
-            "r000228_11.jpg",
-            "r000228_12.jpg",
-            "r000228_13.jpg",
-            "r000228_14.jpg",
-            "r000228_15.jpg",
-            "r000228_16.jpg",
-            "r000228_17.jpg",
-            "r000228_18.jpg",
-            "r000228_19.jpg",
-            "r000228_20.jpg",
-            "r000228_21.jpg",
-            "r000228_22.jpg",
-            "r000228_23.jpg"
-        ],
-        "conversations": [
-            {
-                "from": "human",
-                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is two people get into sports car,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
-            },
-            {
-                "from": "gpt",
-                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
-            }
-        ]
-    },
-    {
-        "id": "r100887",
-        "images": [
-            "r100887_00.jpg",
-            "r100887_01.jpg",
-            "r100887_02.jpg",
-            "r100887_03.jpg",
-            "r100887_04.jpg",
-            "r100887_05.jpg",
-            "r100887_06.jpg",
-            "r100887_07.jpg",
-            "r100887_08.jpg",
-            "r100887_09.jpg",
-            "r100887_10.jpg",
-            "r100887_11.jpg",
-            "r100887_12.jpg",
-            "r100887_13.jpg",
-            "r100887_14.jpg",
-            "r100887_15.jpg"
-        ],
-        "conversations": [
-            {
-                "from": "human",
-                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A person using a spatula to spread something on a pan.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
-            },
-            {
-                "from": "gpt",
-                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
-            }
-        ]
-    },
-    {
-        "id": "r104340",
-        "images": [
-            "r104340_00.jpg",
-            "r104340_01.jpg",
-            "r104340_02.jpg",
-            "r104340_03.jpg",
-            "r104340_04.jpg",
-            "r104340_05.jpg",
-            "r104340_06.jpg",
-            "r104340_07.jpg",
-            "r104340_08.jpg",
-            "r104340_09.jpg",
-            "r104340_10.jpg",
-            "r104340_11.jpg",
-            "r104340_12.jpg",
-            "r104340_13.jpg",
-            "r104340_14.jpg",
-            "r104340_15.jpg"
-        ],
-        "conversations": [
-            {
-                "from": "human",
-                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A bartender is making an alcoholic drink by pouring a clear liquid into a glass with ice cubes.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
-            },
-            {
-                "from": "gpt",
-                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
-            }
-        ]
-    },
-    {
-        "id": "r001599",
-        "images": [
-            "r001599_00.jpg",
-            "r001599_01.jpg",
-            "r001599_02.jpg",
-            "r001599_03.jpg",
-            "r001599_04.jpg",
-            "r001599_05.jpg",
-            "r001599_06.jpg",
-            "r001599_07.jpg",
-            "r001599_08.jpg",
-            "r001599_09.jpg",
-            "r001599_10.jpg",
-            "r001599_11.jpg",
-            "r001599_12.jpg",
-            "r001599_13.jpg",
-            "r001599_14.jpg",
-            "r001599_15.jpg",
-            "r001599_16.jpg",
-            "r001599_17.jpg",
-            "r001599_18.jpg",
-            "r001599_19.jpg",
-            "r001599_20.jpg",
-            "r001599_21.jpg",
-            "r001599_22.jpg",
-            "r001599_23.jpg"
-        ],
-        "conversations": [
-            {
-                "from": "human",
-                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is a man uses a camera,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
-            },
-            {
-                "from": "gpt",
-                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
-            }
-        ]
-    },
-    {
-        "id": "r101298",
-        "images": [
-            "r101298_00.jpg",
-            "r101298_01.jpg",
-            "r101298_02.jpg",
-            "r101298_03.jpg",
-            "r101298_04.jpg",
-            "r101298_05.jpg",
-            "r101298_06.jpg",
-            "r101298_07.jpg",
-            "r101298_08.jpg",
-            "r101298_09.jpg",
-            "r101298_10.jpg",
-            "r101298_11.jpg",
-            "r101298_12.jpg",
-            "r101298_13.jpg",
-            "r101298_14.jpg",
-            "r101298_15.jpg",
-            "r101298_16.jpg",
-            "r101298_17.jpg",
-            "r101298_18.jpg",
-            "r101298_19.jpg",
-            "r101298_20.jpg",
-            "r101298_21.jpg",
-            "r101298_22.jpg",
-            "r101298_23.jpg"
-        ],
-        "conversations": [
-            {
-                "from": "human",
-                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A baseball game is being played in a stadium with a large crowd of people watching.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
-            },
-            {
-                "from": "gpt",
-                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
-            }
-        ]
-    },
-    {
-        "id": "r000688",
-        "images": [
-            "r000688_00.jpg",
-            "r000688_01.jpg",
-            "r000688_02.jpg",
-            "r000688_03.jpg",
-            "r000688_04.jpg",
-            "r000688_05.jpg",
-            "r000688_06.jpg",
-            "r000688_07.jpg",
-            "r000688_08.jpg",
-            "r000688_09.jpg",
-            "r000688_10.jpg",
-            "r000688_11.jpg",
-            "r000688_12.jpg",
-            "r000688_13.jpg",
-            "r000688_14.jpg",
-            "r000688_15.jpg",
-            "r000688_16.jpg",
-            "r000688_17.jpg",
-            "r000688_18.jpg",
-            "r000688_19.jpg",
-            "r000688_20.jpg",
-            "r000688_21.jpg",
-            "r000688_22.jpg",
-            "r000688_23.jpg"
-        ],
-        "conversations": [
-            {
-                "from": "human",
-                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is the child furthest to the right dips her hand into the bowl shared by the other children.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
-            },
-            {
-                "from": "gpt",
-                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
-            }
-        ]
-    },
-    {
-        "id": "r104350",
-        "images": [
-            "r104350_00.jpg",
-            "r104350_01.jpg",
-            "r104350_02.jpg",
-            "r104350_03.jpg",
-            "r104350_04.jpg",
-            "r104350_05.jpg",
-            "r104350_06.jpg",
-            "r104350_07.jpg",
-            "r104350_08.jpg",
-            "r104350_09.jpg",
-            "r104350_10.jpg",
-            "r104350_11.jpg",
-            "r104350_12.jpg",
-            "r104350_13.jpg",
-            "r104350_14.jpg",
-            "r104350_15.jpg",
-            "r104350_16.jpg",
-            "r104350_17.jpg",
-            "r104350_18.jpg",
-            "r104350_19.jpg",
-            "r104350_20.jpg",
-            "r104350_21.jpg",
-            "r104350_22.jpg",
-            "r104350_23.jpg"
-        ],
-        "conversations": [
-            {
-                "from": "human",
-                "value": "\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is A young man is talking to a young woman in a room with books on the shelves.,\nall the frames of video are as follows: \n\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \n"
-            },
-            {
-                "from": "gpt",
-                "value": "visual quality: 3\n object consistency: 3\n dynamic degree: 3\n motion smoothness: 3\n text-to-video alignment: 3\n factual consistency: 3\n overall score: 3\n"
-            }
-        ]
-    }
-]
\ No newline at end of file
diff --git a/examples/image1.jpg b/examples/image1.jpg
deleted file mode 100644
index aa96c098ef8153dc71aefb69177da5bf171f5e83..0000000000000000000000000000000000000000
Binary files a/examples/image1.jpg and /dev/null differ
diff --git a/examples/image10.jpg b/examples/image10.jpg
deleted file mode 100644
index 5f90a093d4c6b87af17b04bd931e8e76785b7742..0000000000000000000000000000000000000000
Binary files a/examples/image10.jpg and /dev/null differ
diff --git a/examples/image11.jpg b/examples/image11.jpg
deleted file mode 100644
index e6583aed834ac02fbb4f6a6de4aec6288f44b37f..0000000000000000000000000000000000000000
Binary files a/examples/image11.jpg and /dev/null differ
diff --git a/examples/image12.jpg b/examples/image12.jpg
deleted file mode 100644
index 547b4eb582e58e7d058265355c6ef3581bbb48c3..0000000000000000000000000000000000000000
Binary files a/examples/image12.jpg and /dev/null differ
diff --git a/examples/image13.jpg b/examples/image13.jpg
deleted file mode 100644
index 89abd71e82eb02560ce0c34653333084e7b0402c..0000000000000000000000000000000000000000
Binary files a/examples/image13.jpg and /dev/null differ
diff --git a/examples/image14.jpg b/examples/image14.jpg
deleted file mode 100644
index 731e15e8fe838785fd888cffe642ff5e88f177f2..0000000000000000000000000000000000000000
Binary files a/examples/image14.jpg and /dev/null differ
diff --git a/examples/image15.jpg b/examples/image15.jpg
deleted file mode 100644
index 47e98b8f196743f059ad73d5d9bfa8e78a11445e..0000000000000000000000000000000000000000
Binary files a/examples/image15.jpg and /dev/null differ
diff --git a/examples/image2.jpg b/examples/image2.jpg
deleted file mode 100644
index 1bb3bdf9ddfe73cffd31788acdcfaf90fba1b127..0000000000000000000000000000000000000000
Binary files a/examples/image2.jpg and /dev/null differ
diff --git a/examples/image3.jpg b/examples/image3.jpg
deleted file mode 100644
index 5737feb3042000a02b97925b616391a8fd49ac85..0000000000000000000000000000000000000000
Binary files a/examples/image3.jpg and /dev/null differ
diff --git a/examples/image4.jpg b/examples/image4.jpg
deleted file mode 100644
index 50b92ebdde9b4da85a300fc238d2759cd7138226..0000000000000000000000000000000000000000
Binary files a/examples/image4.jpg and /dev/null differ
diff --git a/examples/image5.jpg b/examples/image5.jpg
deleted file mode 100644
index d0df84ee667c62bbe086ea3e9b63f6596ceadc3e..0000000000000000000000000000000000000000
Binary files a/examples/image5.jpg and /dev/null differ
diff --git a/examples/image6.jpg b/examples/image6.jpg
deleted file mode 100644
index 9af53cf3ec58876717861d6788a3fb93b275abdf..0000000000000000000000000000000000000000
Binary files a/examples/image6.jpg and /dev/null differ
diff --git a/examples/image7.jpg b/examples/image7.jpg
deleted file mode 100644
index 144598c0590b3f25e10901fa5b4d8eb3cd471b52..0000000000000000000000000000000000000000
Binary files a/examples/image7.jpg and /dev/null differ
diff --git a/examples/image8.jpg b/examples/image8.jpg
deleted file mode 100644
index 7c5096cc785b27c84cc177283cb9693199d1905c..0000000000000000000000000000000000000000
Binary files a/examples/image8.jpg and /dev/null differ
diff --git a/examples/image9.jpg b/examples/image9.jpg
deleted file mode 100644
index 5c6d9f7cbc1b5f7c1cd344fc2f7c44c056acbb65..0000000000000000000000000000000000000000
Binary files a/examples/image9.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_00.jpg b/examples/r000228/r000228_00.jpg
deleted file mode 100644
index 0e687e6fd49e7c665d8e2ffcfff9c5b632163ff8..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_00.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_01.jpg b/examples/r000228/r000228_01.jpg
deleted file mode 100644
index b763236da67dbd67bfa2861ff88b11103422c768..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_01.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_02.jpg b/examples/r000228/r000228_02.jpg
deleted file mode 100644
index 3e7ab44b728ea35209147944b7303c20cc7afe7e..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_02.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_03.jpg b/examples/r000228/r000228_03.jpg
deleted file mode 100644
index c47da40547e3bc52afa3f4f609c2e8f3e6a10c17..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_03.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_04.jpg b/examples/r000228/r000228_04.jpg
deleted file mode 100644
index 527fa1540c8f9c9004c07f3c5ad11495c838dc63..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_04.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_05.jpg b/examples/r000228/r000228_05.jpg
deleted file mode 100644
index e66f96ebb0e18a1f306456b52665f24a38e8ee50..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_05.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_06.jpg b/examples/r000228/r000228_06.jpg
deleted file mode 100644
index e5fa8947bbb0a883d425a6a1d990357ec4a50ef3..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_06.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_07.jpg b/examples/r000228/r000228_07.jpg
deleted file mode 100644
index 5656036183ec0a57e6b5985d90491ff33102b37b..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_07.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_08.jpg b/examples/r000228/r000228_08.jpg
deleted file mode 100644
index 345db7bc55d7ade809e859338391e2c29a310414..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_08.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_09.jpg b/examples/r000228/r000228_09.jpg
deleted file mode 100644
index fb7780f9b66857aac43225a26868da39d56082aa..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_09.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_10.jpg b/examples/r000228/r000228_10.jpg
deleted file mode 100644
index 17b1aa0ac42e6874a265de9df11557998bed3d3c..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_10.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_11.jpg b/examples/r000228/r000228_11.jpg
deleted file mode 100644
index f5689c5b514bc38755bee8bd0f2a2678dd907c78..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_11.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_12.jpg b/examples/r000228/r000228_12.jpg
deleted file mode 100644
index b2b529437a34927a85dc5700aa9ccdda971cf816..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_12.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_13.jpg b/examples/r000228/r000228_13.jpg
deleted file mode 100644
index 652dd2693dbb821c6082cc2199f8faafffd75ff9..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_13.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_14.jpg b/examples/r000228/r000228_14.jpg
deleted file mode 100644
index a32f84ded89a05ba54ab6d709d327abace17343a..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_14.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_15.jpg b/examples/r000228/r000228_15.jpg
deleted file mode 100644
index 9f6e5476da33978430cae95fc77eb0f609883113..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_15.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_16.jpg b/examples/r000228/r000228_16.jpg
deleted file mode 100644
index 88696ddfcf1001d4e740b483c730d1eef3521a7b..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_16.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_17.jpg b/examples/r000228/r000228_17.jpg
deleted file mode 100644
index 2144fe53a1ae0c92a5644a187deacf117bb28db0..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_17.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_18.jpg b/examples/r000228/r000228_18.jpg
deleted file mode 100644
index 8b225421e11ae8deb222940981605d3ebb7a366a..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_18.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_19.jpg b/examples/r000228/r000228_19.jpg
deleted file mode 100644
index 8b225421e11ae8deb222940981605d3ebb7a366a..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_19.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_20.jpg b/examples/r000228/r000228_20.jpg
deleted file mode 100644
index 3d3c85f139288158eb1934b229b3c1626aa2ed2a..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_20.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_21.jpg b/examples/r000228/r000228_21.jpg
deleted file mode 100644
index fd7172d2cf6911ff7dd50047ef5bf6ecb6ca918b..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_21.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_22.jpg b/examples/r000228/r000228_22.jpg
deleted file mode 100644
index 78ecc6bf697a9a40317e49f696ea2a954c3f5ce9..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_22.jpg and /dev/null differ
diff --git a/examples/r000228/r000228_23.jpg b/examples/r000228/r000228_23.jpg
deleted file mode 100644
index 692c7a19efe5af334324758f8e65577e8223278a..0000000000000000000000000000000000000000
Binary files a/examples/r000228/r000228_23.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_00.jpg b/examples/r000688/r000688_00.jpg
deleted file mode 100644
index 1ae1be7179e04277d0186379b9fea55750a0f640..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_00.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_01.jpg b/examples/r000688/r000688_01.jpg
deleted file mode 100644
index b7aff773c8ece99e841f232b882e7ecc455a4807..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_01.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_02.jpg b/examples/r000688/r000688_02.jpg
deleted file mode 100644
index 6b61cea8b9bf3b908064a6d18215057fa819ef5f..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_02.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_03.jpg b/examples/r000688/r000688_03.jpg
deleted file mode 100644
index e9df8bbb673951acca4ff4799841c122d0de2b32..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_03.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_04.jpg b/examples/r000688/r000688_04.jpg
deleted file mode 100644
index 3306071f41cd693a697e674cf7620183b17d3e8b..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_04.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_05.jpg b/examples/r000688/r000688_05.jpg
deleted file mode 100644
index ef6f622b4e610f65cf22e79315520487d9c2d6bc..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_05.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_06.jpg b/examples/r000688/r000688_06.jpg
deleted file mode 100644
index 02dd435ddc02b072111d28da67d8d24199f2c8b6..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_06.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_07.jpg b/examples/r000688/r000688_07.jpg
deleted file mode 100644
index 7067bdb885ba634eeb7427b4a0fe73d527f28d45..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_07.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_08.jpg b/examples/r000688/r000688_08.jpg
deleted file mode 100644
index a8485dade12462184bd0c2bade6d73a20596876f..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_08.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_09.jpg b/examples/r000688/r000688_09.jpg
deleted file mode 100644
index c7ce9d982bf34c1601ba7cac889d639756f91e2e..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_09.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_10.jpg b/examples/r000688/r000688_10.jpg
deleted file mode 100644
index 40f505d80d3864c49dd2544fdee5a85ce6653b2b..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_10.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_11.jpg b/examples/r000688/r000688_11.jpg
deleted file mode 100644
index 67609055c5e8bb520de73548a12c0f8a219068fc..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_11.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_12.jpg b/examples/r000688/r000688_12.jpg
deleted file mode 100644
index 67e79ff5d998f0eea7942cb61973e146cf654afe..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_12.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_13.jpg b/examples/r000688/r000688_13.jpg
deleted file mode 100644
index eb561e67ac80edf200548aee75131793267ec851..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_13.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_14.jpg b/examples/r000688/r000688_14.jpg
deleted file mode 100644
index ddabe504ef602bd4576778e48df09b43ad43df96..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_14.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_15.jpg b/examples/r000688/r000688_15.jpg
deleted file mode 100644
index ec3a053f8a180a6e03e21a74251dd8e32a720c81..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_15.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_16.jpg b/examples/r000688/r000688_16.jpg
deleted file mode 100644
index 1b2bc462a53529f01260a9b030304a83655cdf33..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_16.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_17.jpg b/examples/r000688/r000688_17.jpg
deleted file mode 100644
index eee750404f37e0ff7d969ea7a3acb67231336368..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_17.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_18.jpg b/examples/r000688/r000688_18.jpg
deleted file mode 100644
index ae3553e8fd9f1ee8497ddbd04f4d11e78f0023b3..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_18.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_19.jpg b/examples/r000688/r000688_19.jpg
deleted file mode 100644
index 4f0367d0704afb8b527eef6a1621209d0384e8a3..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_19.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_20.jpg b/examples/r000688/r000688_20.jpg
deleted file mode 100644
index 4942cab5b3fac0a3a64c9c1db2750cdd68d1852e..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_20.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_21.jpg b/examples/r000688/r000688_21.jpg
deleted file mode 100644
index b4aeb086cdce720ac9fc48a5969955ab1d7c4a65..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_21.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_22.jpg b/examples/r000688/r000688_22.jpg
deleted file mode 100644
index 9025112b7cbb43699fd17431054e08b436cdf755..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_22.jpg and /dev/null differ
diff --git a/examples/r000688/r000688_23.jpg b/examples/r000688/r000688_23.jpg
deleted file mode 100644
index 3dc33bc1f6178a3e406f28a80537ba37fdf52a66..0000000000000000000000000000000000000000
Binary files a/examples/r000688/r000688_23.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_00.jpg b/examples/r001599/r001599_00.jpg
deleted file mode 100644
index 6e03be188bbefb996046b5888d805e7f57c60140..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_00.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_01.jpg b/examples/r001599/r001599_01.jpg
deleted file mode 100644
index 40665d0ad473f9dbfbc35dbb31cd942842bb4efd..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_01.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_02.jpg b/examples/r001599/r001599_02.jpg
deleted file mode 100644
index f27c254c414e025c702d75902bdc074436746575..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_02.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_03.jpg b/examples/r001599/r001599_03.jpg
deleted file mode 100644
index 6bc3dfd8c0a501f9d61f9bdb6249b0c7226ac206..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_03.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_04.jpg b/examples/r001599/r001599_04.jpg
deleted file mode 100644
index 718dcdb1c1e44f9e2393f46760d3ae2a54b507c2..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_04.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_05.jpg b/examples/r001599/r001599_05.jpg
deleted file mode 100644
index dda4f9dc883e141a5b7148fd3f2f1314eb70e88b..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_05.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_06.jpg b/examples/r001599/r001599_06.jpg
deleted file mode 100644
index a7781b74d34bfc9a7c00e8f62784fe4412eb320f..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_06.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_07.jpg b/examples/r001599/r001599_07.jpg
deleted file mode 100644
index 90230ba73ffe67de3ce515a0ba892bbd29f413f2..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_07.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_08.jpg b/examples/r001599/r001599_08.jpg
deleted file mode 100644
index d0af6f91d5867e22f584358c01b0886934941ee6..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_08.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_09.jpg b/examples/r001599/r001599_09.jpg
deleted file mode 100644
index 29a6de2c47b049c35d0e479d800a25c82b5da7a4..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_09.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_10.jpg b/examples/r001599/r001599_10.jpg
deleted file mode 100644
index b034a17051ac03f6c048a9fdffb42a74424d6bc6..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_10.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_11.jpg b/examples/r001599/r001599_11.jpg
deleted file mode 100644
index 1a103b3526d01ab6246f4a67c3678e6a8146e18a..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_11.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_12.jpg b/examples/r001599/r001599_12.jpg
deleted file mode 100644
index 83962c006d09afd6c38cbc5b22105f31d8e9657d..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_12.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_13.jpg b/examples/r001599/r001599_13.jpg
deleted file mode 100644
index b623d74f904277c615d64d8196f8bd1dec97244b..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_13.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_14.jpg b/examples/r001599/r001599_14.jpg
deleted file mode 100644
index bff53c0164c45ab163bf9f6e3098914b21f97311..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_14.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_15.jpg b/examples/r001599/r001599_15.jpg
deleted file mode 100644
index aee062ebcdbcd84f726c778cf29e823881976193..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_15.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_16.jpg b/examples/r001599/r001599_16.jpg
deleted file mode 100644
index 0dbda401270bee11c7287792f3dbb943f45fb7db..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_16.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_17.jpg b/examples/r001599/r001599_17.jpg
deleted file mode 100644
index 6f8c83bf45fe8bf98808d3d0d50bc4412061177e..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_17.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_18.jpg b/examples/r001599/r001599_18.jpg
deleted file mode 100644
index cbf5091d6e30c002b159c9a95b9a348b9903bc01..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_18.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_19.jpg b/examples/r001599/r001599_19.jpg
deleted file mode 100644
index a27c7d31eaf0caca881fc86e0b978d8d050c1c10..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_19.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_20.jpg b/examples/r001599/r001599_20.jpg
deleted file mode 100644
index f2541b5e8df46719c2f700a9e90a191098e55a98..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_20.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_21.jpg b/examples/r001599/r001599_21.jpg
deleted file mode 100644
index 72f00348c4311c4723a35e0d03d97ecc598e3b9f..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_21.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_22.jpg b/examples/r001599/r001599_22.jpg
deleted file mode 100644
index fe4e6e4a97eda6dcaa123b418d9a381b4a99c2d4..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_22.jpg and /dev/null differ
diff --git a/examples/r001599/r001599_23.jpg b/examples/r001599/r001599_23.jpg
deleted file mode 100644
index f8011803759d35de3444646b5fcfd29f60992650..0000000000000000000000000000000000000000
Binary files a/examples/r001599/r001599_23.jpg and /dev/null differ
diff --git a/examples/r003679.mp4 b/examples/r003679.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..5c779e1b6e3f085244726e5217384a361a77c045
Binary files /dev/null and b/examples/r003679.mp4 differ
diff --git a/examples/r004061.mp4 b/examples/r004061.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..21cf2a8f16d4a7fc2d7638cc8cd5762647faf54f
Binary files /dev/null and b/examples/r004061.mp4 differ
diff --git a/examples/r100887/r100887_00.jpg b/examples/r100887/r100887_00.jpg
deleted file mode 100644
index fa7322b0e4d079b499da51224e752d1cb4440129..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_00.jpg and /dev/null differ
diff --git a/examples/r100887/r100887_01.jpg b/examples/r100887/r100887_01.jpg
deleted file mode 100644
index c9fbbeadbdc97c4c9ecfbb075cb91793ce49d32b..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_01.jpg and /dev/null differ
diff --git a/examples/r100887/r100887_02.jpg b/examples/r100887/r100887_02.jpg
deleted file mode 100644
index 38e23cd7b02a9c6b278a7b45a0e45e559d8e39aa..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_02.jpg and /dev/null differ
diff --git a/examples/r100887/r100887_03.jpg b/examples/r100887/r100887_03.jpg
deleted file mode 100644
index 18d24d9fa68946f576091fc88f704e66060d32d0..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_03.jpg and /dev/null differ
diff --git a/examples/r100887/r100887_04.jpg b/examples/r100887/r100887_04.jpg
deleted file mode 100644
index 3428524c2ae23a1c17fba6b4e362b801578a69ae..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_04.jpg and /dev/null differ
diff --git a/examples/r100887/r100887_05.jpg b/examples/r100887/r100887_05.jpg
deleted file mode 100644
index 17db391eae533c47732ea4409ae630df36345ec4..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_05.jpg and /dev/null differ
diff --git a/examples/r100887/r100887_06.jpg b/examples/r100887/r100887_06.jpg
deleted file mode 100644
index d52e1b0e32dd7d31d4235c89f9b562f2e9921121..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_06.jpg and /dev/null differ
diff --git a/examples/r100887/r100887_07.jpg b/examples/r100887/r100887_07.jpg
deleted file mode 100644
index 0dbdb1060942e35ad0b542b4bcca6ed6a40bcb51..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_07.jpg and /dev/null differ
diff --git a/examples/r100887/r100887_08.jpg b/examples/r100887/r100887_08.jpg
deleted file mode 100644
index a0421349d5a60bdff8bb7ad2ca3ed91ecb788297..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_08.jpg and /dev/null differ
diff --git a/examples/r100887/r100887_09.jpg b/examples/r100887/r100887_09.jpg
deleted file mode 100644
index 686ccbc277518d8787c33df151088adfcc6e8921..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_09.jpg and /dev/null differ
diff --git a/examples/r100887/r100887_10.jpg b/examples/r100887/r100887_10.jpg
deleted file mode 100644
index f8b8cdf743ffefccc426e6fb0d6656cd38945126..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_10.jpg and /dev/null differ
diff --git a/examples/r100887/r100887_11.jpg b/examples/r100887/r100887_11.jpg
deleted file mode 100644
index b3578b655a793316f695f14e2e65b0c791dddab7..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_11.jpg and /dev/null differ
diff --git a/examples/r100887/r100887_12.jpg b/examples/r100887/r100887_12.jpg
deleted file mode 100644
index 49b0daf750efdf824d60476a5ca0c893803bd57e..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_12.jpg and /dev/null differ
diff --git a/examples/r100887/r100887_13.jpg b/examples/r100887/r100887_13.jpg
deleted file mode 100644
index ac00079d79e40c6a86f300c6328926c11e949a23..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_13.jpg and /dev/null differ
diff --git a/examples/r100887/r100887_14.jpg b/examples/r100887/r100887_14.jpg
deleted file mode 100644
index c2d8f2cdaf1146f40c81de00202c5d60a10bf1fa..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_14.jpg and /dev/null differ
diff --git a/examples/r100887/r100887_15.jpg b/examples/r100887/r100887_15.jpg
deleted file mode 100644
index 3438a0864add23c65863d9c66736245082ab017d..0000000000000000000000000000000000000000
Binary files a/examples/r100887/r100887_15.jpg and /dev/null differ
diff --git a/examples/r100916.mp4 b/examples/r100916.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..0f521665ba2d80011c23ed5f72d2c621a5c9db94
Binary files /dev/null and b/examples/r100916.mp4 differ
diff --git a/examples/r101298/r101298_00.jpg b/examples/r101298/r101298_00.jpg
deleted file mode 100644
index 8dbae3287d592a08ab4f30da07173005269d235b..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_00.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_01.jpg b/examples/r101298/r101298_01.jpg
deleted file mode 100644
index 5315559f664be3bbbfe015c2459913a7ec4b2e60..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_01.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_02.jpg b/examples/r101298/r101298_02.jpg
deleted file mode 100644
index adc59fa0cadca754d6563b19521d169496dae1b7..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_02.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_03.jpg b/examples/r101298/r101298_03.jpg
deleted file mode 100644
index 0f4f030f464f5a73468962de17b696f75747b852..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_03.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_04.jpg b/examples/r101298/r101298_04.jpg
deleted file mode 100644
index 3b89aa1bb13202f30b150abb5a767d77d41b9985..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_04.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_05.jpg b/examples/r101298/r101298_05.jpg
deleted file mode 100644
index 4a9676a43555bdf747e6153539edb70410c02447..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_05.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_06.jpg b/examples/r101298/r101298_06.jpg
deleted file mode 100644
index dbe3eaa5e7fa62199082b2fa64ed2e2e9c084dd1..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_06.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_07.jpg b/examples/r101298/r101298_07.jpg
deleted file mode 100644
index 33c99826df868455d7217e394c679658e09c222e..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_07.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_08.jpg b/examples/r101298/r101298_08.jpg
deleted file mode 100644
index f48ef9e51fbf8453fb0085922481739d57ce4620..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_08.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_09.jpg b/examples/r101298/r101298_09.jpg
deleted file mode 100644
index 8f9f91f432b9ab65cb624067441ed948deca7ed3..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_09.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_10.jpg b/examples/r101298/r101298_10.jpg
deleted file mode 100644
index a026bcb866a9f09cf874663ec43cf40801687cea..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_10.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_11.jpg b/examples/r101298/r101298_11.jpg
deleted file mode 100644
index 49ec40ff3988be2a795e110ffce89a9239943588..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_11.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_12.jpg b/examples/r101298/r101298_12.jpg
deleted file mode 100644
index 1ff5f772ee5e8bc38f7ce08d5bfb0d1b4e7869f4..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_12.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_13.jpg b/examples/r101298/r101298_13.jpg
deleted file mode 100644
index b69d0ce8d479ed1d40095f22b16977152a0be097..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_13.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_14.jpg b/examples/r101298/r101298_14.jpg
deleted file mode 100644
index 1b53000b2b60bd4a607929a2f930df36dc9b2945..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_14.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_15.jpg b/examples/r101298/r101298_15.jpg
deleted file mode 100644
index d5ae9b79ac53b84691035128a1a407ba551bc992..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_15.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_16.jpg b/examples/r101298/r101298_16.jpg
deleted file mode 100644
index d8b01d50d23b1ee7a3b3e9fbefdcebb3680f6b41..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_16.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_17.jpg b/examples/r101298/r101298_17.jpg
deleted file mode 100644
index 65e0ba7427b9193a55527ca90a8266d9eca7c8d6..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_17.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_18.jpg b/examples/r101298/r101298_18.jpg
deleted file mode 100644
index c99d22a030a43cf76c18045156d6b835729b06ad..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_18.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_19.jpg b/examples/r101298/r101298_19.jpg
deleted file mode 100644
index f797fc0ab45e15f9fb49b257e1960ff7ba33ee21..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_19.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_20.jpg b/examples/r101298/r101298_20.jpg
deleted file mode 100644
index 10a358df3d64a543797b326497ed462b4493304d..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_20.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_21.jpg b/examples/r101298/r101298_21.jpg
deleted file mode 100644
index 7970c454e80b32b7dc18865dcd9a6c8e65ca5001..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_21.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_22.jpg b/examples/r101298/r101298_22.jpg
deleted file mode 100644
index b64fcf7d2198090871be75212bfc400b7e506e74..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_22.jpg and /dev/null differ
diff --git a/examples/r101298/r101298_23.jpg b/examples/r101298/r101298_23.jpg
deleted file mode 100644
index bff859027f32ccc42a562659b4b38c6570b55034..0000000000000000000000000000000000000000
Binary files a/examples/r101298/r101298_23.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_00.jpg b/examples/r104340/r104340_00.jpg
deleted file mode 100644
index 07662d7dd7a3429be89a3500366301b953278582..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_00.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_01.jpg b/examples/r104340/r104340_01.jpg
deleted file mode 100644
index 60e073ca92b4a4b80a9bbce5ca722919c34194aa..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_01.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_02.jpg b/examples/r104340/r104340_02.jpg
deleted file mode 100644
index 4d60efec93bf453f6171d21a32a8c60074587b2a..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_02.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_03.jpg b/examples/r104340/r104340_03.jpg
deleted file mode 100644
index 0a390de6545a80a35440e0b9ce8e4c7f798eae2d..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_03.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_04.jpg b/examples/r104340/r104340_04.jpg
deleted file mode 100644
index c9d12bcfdcee6bf22e582ab4341a3ef82e0a695e..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_04.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_05.jpg b/examples/r104340/r104340_05.jpg
deleted file mode 100644
index 9595eec1a9c75ad170ac5357b9e9cdfdc315ad05..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_05.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_06.jpg b/examples/r104340/r104340_06.jpg
deleted file mode 100644
index 00f18062c77034352358f91aa7e4a5c00128d867..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_06.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_07.jpg b/examples/r104340/r104340_07.jpg
deleted file mode 100644
index 8f34307fc2113e9601b13bc770320bda3bc5e806..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_07.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_08.jpg b/examples/r104340/r104340_08.jpg
deleted file mode 100644
index 52029e973e306d82291e76e694ad7059428800b2..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_08.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_09.jpg b/examples/r104340/r104340_09.jpg
deleted file mode 100644
index 263ceea7bb9a396f79c71234df525e2cfe95027d..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_09.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_10.jpg b/examples/r104340/r104340_10.jpg
deleted file mode 100644
index bb58df9eabb97d2f49b9a55e38966c11b99dd549..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_10.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_11.jpg b/examples/r104340/r104340_11.jpg
deleted file mode 100644
index 99e688969de566a36a654bbc53086e825dc935b7..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_11.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_12.jpg b/examples/r104340/r104340_12.jpg
deleted file mode 100644
index 8a85e2bbd7ea3c1708cea0573b99903d3ddbc832..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_12.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_13.jpg b/examples/r104340/r104340_13.jpg
deleted file mode 100644
index 76c750c9d8ff11fa14b91ac39587f56e7ab33c1e..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_13.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_14.jpg b/examples/r104340/r104340_14.jpg
deleted file mode 100644
index 5e390c0ab9a525e43e30919d0b556a20a65c3d43..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_14.jpg and /dev/null differ
diff --git a/examples/r104340/r104340_15.jpg b/examples/r104340/r104340_15.jpg
deleted file mode 100644
index 9a6bddc5c50ed2e0c4e3704b11c949c5de2d673f..0000000000000000000000000000000000000000
Binary files a/examples/r104340/r104340_15.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_00.jpg b/examples/r104350/r104350_00.jpg
deleted file mode 100644
index 9cbd97e1c34b124c906df3091f66d296599f7b17..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_00.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_01.jpg b/examples/r104350/r104350_01.jpg
deleted file mode 100644
index 1718d51b237f02d8b152b6bd6473178ea298517d..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_01.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_02.jpg b/examples/r104350/r104350_02.jpg
deleted file mode 100644
index f4a6a5db6de48721fafae2ae830bffb5c6013b0d..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_02.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_03.jpg b/examples/r104350/r104350_03.jpg
deleted file mode 100644
index decb630b3d1e046678811aa43ea29f9b55b847c5..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_03.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_04.jpg b/examples/r104350/r104350_04.jpg
deleted file mode 100644
index f356c8e15ea631266a45baffcc7ee83469a967ec..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_04.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_05.jpg b/examples/r104350/r104350_05.jpg
deleted file mode 100644
index b07b2250bfd0fb8a1d0cf78cd74df4910b7b6fa8..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_05.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_06.jpg b/examples/r104350/r104350_06.jpg
deleted file mode 100644
index feb0c89770542bc5f1d055f522a9d66a737ad3c6..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_06.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_07.jpg b/examples/r104350/r104350_07.jpg
deleted file mode 100644
index d4b4dd8dd293f3862b59616e5987d796bc8c6718..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_07.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_08.jpg b/examples/r104350/r104350_08.jpg
deleted file mode 100644
index ba1b1b168e17a2678855ac94245063c225fa0da1..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_08.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_09.jpg b/examples/r104350/r104350_09.jpg
deleted file mode 100644
index 7e9fb3103341979c2ec56f6e8b558b132aa918c8..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_09.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_10.jpg b/examples/r104350/r104350_10.jpg
deleted file mode 100644
index aa9dd28b7da8d61c2ef8ba8d2a529eb947577c1f..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_10.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_11.jpg b/examples/r104350/r104350_11.jpg
deleted file mode 100644
index 50abdedb54eb74e2441a2d80168b79d6c7405c31..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_11.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_12.jpg b/examples/r104350/r104350_12.jpg
deleted file mode 100644
index fd7dacacc7ba28e1567c11ba8181621618d477bd..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_12.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_13.jpg b/examples/r104350/r104350_13.jpg
deleted file mode 100644
index 2d95af3ceb69467dd677f51073a90b89dbee7a4e..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_13.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_14.jpg b/examples/r104350/r104350_14.jpg
deleted file mode 100644
index 64e7d328cbb9094aaad63d7a5678ed5dc6386a8a..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_14.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_15.jpg b/examples/r104350/r104350_15.jpg
deleted file mode 100644
index dbf2778160da9fc45cd045c2a5b2d0052be111d8..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_15.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_16.jpg b/examples/r104350/r104350_16.jpg
deleted file mode 100644
index 8f73f9637a35b65c9e5ebcb6eedd89f11a66ba5f..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_16.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_17.jpg b/examples/r104350/r104350_17.jpg
deleted file mode 100644
index 57fc6864eb7e7c57f4236960f84d94e7c49e1fa3..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_17.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_18.jpg b/examples/r104350/r104350_18.jpg
deleted file mode 100644
index becd48023aea4e016037c5cfebfa884cc153f61f..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_18.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_19.jpg b/examples/r104350/r104350_19.jpg
deleted file mode 100644
index bc761a4f14650e38ef8486c9ca242417688c124f..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_19.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_20.jpg b/examples/r104350/r104350_20.jpg
deleted file mode 100644
index 694a61a211779d13597ca73dc639d0607be9ed37..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_20.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_21.jpg b/examples/r104350/r104350_21.jpg
deleted file mode 100644
index 11bbac062bdc1c422bceea9788a710f37c1be983..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_21.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_22.jpg b/examples/r104350/r104350_22.jpg
deleted file mode 100644
index d4e4652f0b8c28500fe94872746a0ac262fed08a..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_22.jpg and /dev/null differ
diff --git a/examples/r104350/r104350_23.jpg b/examples/r104350/r104350_23.jpg
deleted file mode 100644
index c555fd0f4fb94533f699a12fe14cd56db8f6382b..0000000000000000000000000000000000000000
Binary files a/examples/r104350/r104350_23.jpg and /dev/null differ
diff --git a/examples/test.ipynb b/examples/test.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..f136efb3c6f538d7148d192f175b08bda2cc1d6a
--- /dev/null
+++ b/examples/test.ipynb
@@ -0,0 +1,560 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: 'path/to/your/images'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 31\u001b[0m\n\u001b[1;32m     28\u001b[0m output_video \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124moutput_video.mp4\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m     29\u001b[0m frame_rate \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m30\u001b[39m  \u001b[38;5;66;03m# Frames per second\u001b[39;00m\n\u001b[0;32m---> 31\u001b[0m \u001b[43mcreate_video_from_frames\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_directory\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_video\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mframe_rate\u001b[49m\u001b[43m)\u001b[49m\n",
+      "Cell \u001b[0;32mIn[1], line 6\u001b[0m, in \u001b[0;36mcreate_video_from_frames\u001b[0;34m(input_dir, output_file, frame_rate)\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcreate_video_from_frames\u001b[39m(input_dir, output_file, frame_rate):\n\u001b[1;32m      5\u001b[0m     \u001b[38;5;66;03m# Get all image files from the input directory\u001b[39;00m\n\u001b[0;32m----> 6\u001b[0m     images \u001b[38;5;241m=\u001b[39m [img \u001b[38;5;28;01mfor\u001b[39;00m img \u001b[38;5;129;01min\u001b[39;00m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlistdir\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_dir\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m img\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.png\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m img\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.jpg\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n\u001b[1;32m      7\u001b[0m     images\u001b[38;5;241m.\u001b[39msort()  \u001b[38;5;66;03m# Ensure the images are sorted\u001b[39;00m\n\u001b[1;32m      9\u001b[0m     \u001b[38;5;66;03m# Read the first image to get the dimensions\u001b[39;00m\n",
+      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'path/to/your/images'"
+     ]
+    }
+   ],
+   "source": [
+    "import cv2\n",
+    "import os\n",
+    "\n",
+    "def create_video_from_frames(input_dir, output_file, frame_rate):\n",
+    "    # Get all image files from the input directory\n",
+    "    images = [img for img in os.listdir(input_dir) if img.endswith(\".png\") or img.endswith(\".jpg\")]\n",
+    "    images.sort()  # Ensure the images are sorted\n",
+    "\n",
+    "    # Read the first image to get the dimensions\n",
+    "    frame = cv2.imread(os.path.join(input_dir, images[0]))\n",
+    "    height, width, layers = frame.shape\n",
+    "\n",
+    "    # Initialize the video writer\n",
+    "    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for .mp4 file\n",
+    "    video = cv2.VideoWriter(output_file, fourcc, frame_rate, (width, height))\n",
+    "\n",
+    "    # Write each image to the video\n",
+    "    for image in images:\n",
+    "        img_path = os.path.join(input_dir, image)\n",
+    "        frame = cv2.imread(img_path)\n",
+    "        video.write(frame)\n",
+    "\n",
+    "    # Release the video writer\n",
+    "    video.release()\n",
+    "\n",
+    "# Example usage\n",
+    "input_directory = 'path/to/your/images'\n",
+    "output_video = 'output_video.mp4'\n",
+    "frame_rate = 30  # Frames per second\n",
+    "\n",
+    "create_video_from_frames(input_directory, output_video, frame_rate)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "with open(\"./all_subsets.json\", 'r') as f:\n",
+    "    examples = json.load(f)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "15"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(examples)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'r003679_00.jpg'"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "examples[0]['images'][0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "dog rolls over and falls off couch\n",
+      "Created video for r003679\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[ERROR:0@119.424] global cap_ffmpeg_impl.hpp:3130 open Could not find encoder for codec_id=27, error: Encoder not found\n",
+      "[ERROR:0@119.424] global cap_ffmpeg_impl.hpp:3208 open VIDEOIO/FFMPEG: Failed to initialize VideoWriter\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "import os\n",
+    "import cv2\n",
+    "import regex as re\n",
+    "import shutil\n",
+    "for item in examples:\n",
+    "    video_id = item['images'][0].split(\"_\")[0]\n",
+    "    images = [os.path.join(video_id, x) for x in item['images']]\n",
+    "    output_file = f\"{video_id}.mp4\"\n",
+    "    frame = cv2.imread(images[0])\n",
+    "    if frame is None:\n",
+    "        print(f\"Skipping {video_id}\")\n",
+    "        continue\n",
+    "    height, width, layers = frame.shape\n",
+    "    item['video'] = output_file\n",
+    "    \n",
+    "    pattern = r'(?<=For this item, the text prompt is ).+(?=,)'\n",
+    "    prompt = re.findall(pattern, item['conversations'][0]['value'])[0]\n",
+    "    print(prompt)\n",
+    "    item['prompt'] = prompt.strip()\n",
+    "    \n",
+    "    # Initialize the video writer\n",
+    "    # fourcc = cv2.VideoWriter_fourcc(*'H264')   # Codec for .mp4 file|\n",
+    "    fourcc = cv2.VideoWriter_fourcc(*'avc3')   # Codec for .mp4 file|\n",
+    "    frame_rate = 8\n",
+    "    video = cv2.VideoWriter(output_file, fourcc, frame_rate, (width, height))\n",
+    "\n",
+    "    # Write each image to the video\n",
+    "    for image in images:\n",
+    "        frame = cv2.imread(image)\n",
+    "        video.write(frame)\n",
+    "    print(f\"Created video for {video_id}\")\n",
+    "        \n",
+    "    # shutil.rmtree(video_id)\n",
+    "\n",
+    "    # Release the video writer\n",
+    "    video.release()\n",
+    "    break\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"./all_subsets.json\", 'w') as f:\n",
+    "    json.dump(examples, f, indent=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'id': 'real_r003679',\n",
+       "  'images': ['r003679_00.jpg',\n",
+       "   'r003679_01.jpg',\n",
+       "   'r003679_02.jpg',\n",
+       "   'r003679_03.jpg',\n",
+       "   'r003679_04.jpg',\n",
+       "   'r003679_05.jpg',\n",
+       "   'r003679_06.jpg',\n",
+       "   'r003679_07.jpg',\n",
+       "   'r003679_08.jpg',\n",
+       "   'r003679_09.jpg',\n",
+       "   'r003679_10.jpg',\n",
+       "   'r003679_11.jpg',\n",
+       "   'r003679_12.jpg',\n",
+       "   'r003679_13.jpg',\n",
+       "   'r003679_14.jpg',\n",
+       "   'r003679_15.jpg',\n",
+       "   'r003679_16.jpg',\n",
+       "   'r003679_17.jpg',\n",
+       "   'r003679_18.jpg',\n",
+       "   'r003679_19.jpg',\n",
+       "   'r003679_20.jpg',\n",
+       "   'r003679_21.jpg',\n",
+       "   'r003679_22.jpg',\n",
+       "   'r003679_23.jpg'],\n",
+       "  'conversations': [{'from': 'human',\n",
+       "    'value': \"\\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \\nplease watch the following frames of a given video and see the text prompt for generating the video, \\nthen give scores from 7 different dimensions:\\n(1) visual quality, \\n(2) object consistency,\\n(3) dynamic degree,\\n(4) motion smoothness,\\n(5) text-to-video alignment,\\n(6) factual consistency, \\n(7) overall score\\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\\nHere is an output example: \\nvisual quality: 3\\nobject consistency: 2 \\ndynamic degree: 2\\nmotion smoothness: 1\\ntext-to-video alignment: 1\\nfactual consistency: 2\\noverall score: 1\\n\\nFor this item, the text prompt is dog rolls over and falls off couch,\\nall the frames of video are as follows: \\n\\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \\n\"},\n",
+       "   {'from': 'gpt',\n",
+       "    'value': 'visual quality: 3\\n object consistency: 3\\n dynamic degree: 3\\n motion smoothness: 3\\n text-to-video alignment: 3\\n factual consistency: 3\\n overall score: 3\\n'}]},\n",
+       " {'id': 'real_r100916',\n",
+       "  'images': ['r100916_00.jpg',\n",
+       "   'r100916_01.jpg',\n",
+       "   'r100916_02.jpg',\n",
+       "   'r100916_03.jpg',\n",
+       "   'r100916_04.jpg',\n",
+       "   'r100916_05.jpg',\n",
+       "   'r100916_06.jpg',\n",
+       "   'r100916_07.jpg',\n",
+       "   'r100916_08.jpg',\n",
+       "   'r100916_09.jpg',\n",
+       "   'r100916_10.jpg',\n",
+       "   'r100916_11.jpg',\n",
+       "   'r100916_12.jpg',\n",
+       "   'r100916_13.jpg',\n",
+       "   'r100916_14.jpg',\n",
+       "   'r100916_15.jpg'],\n",
+       "  'conversations': [{'from': 'human',\n",
+       "    'value': \"\\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \\nplease watch the following frames of a given video and see the text prompt for generating the video, \\nthen give scores from 7 different dimensions:\\n(1) visual quality, \\n(2) object consistency,\\n(3) dynamic degree,\\n(4) motion smoothness,\\n(5) text-to-video alignment,\\n(6) factual consistency, \\n(7) overall score\\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\\nHere is an output example: \\nvisual quality: 3\\nobject consistency: 2 \\ndynamic degree: 2\\nmotion smoothness: 1\\ntext-to-video alignment: 1\\nfactual consistency: 2\\noverall score: 1\\n\\nFor this item, the text prompt is A young man is standing in front of a glacier with mountains in the background.,\\nall the frames of video are as follows: \\n\\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \\n\"},\n",
+       "   {'from': 'gpt',\n",
+       "    'value': 'visual quality: 3\\n object consistency: 3\\n dynamic degree: 3\\n motion smoothness: 3\\n text-to-video alignment: 3\\n factual consistency: 3\\n overall score: 3\\n'}]},\n",
+       " {'id': 'real_r004061',\n",
+       "  'images': ['r004061_00.jpg',\n",
+       "   'r004061_01.jpg',\n",
+       "   'r004061_02.jpg',\n",
+       "   'r004061_03.jpg',\n",
+       "   'r004061_04.jpg',\n",
+       "   'r004061_05.jpg',\n",
+       "   'r004061_06.jpg',\n",
+       "   'r004061_07.jpg',\n",
+       "   'r004061_08.jpg',\n",
+       "   'r004061_09.jpg',\n",
+       "   'r004061_10.jpg',\n",
+       "   'r004061_11.jpg',\n",
+       "   'r004061_12.jpg',\n",
+       "   'r004061_13.jpg',\n",
+       "   'r004061_14.jpg',\n",
+       "   'r004061_15.jpg',\n",
+       "   'r004061_16.jpg',\n",
+       "   'r004061_17.jpg',\n",
+       "   'r004061_18.jpg',\n",
+       "   'r004061_19.jpg',\n",
+       "   'r004061_20.jpg',\n",
+       "   'r004061_21.jpg',\n",
+       "   'r004061_22.jpg',\n",
+       "   'r004061_23.jpg'],\n",
+       "  'conversations': [{'from': 'human',\n",
+       "    'value': \"\\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \\nplease watch the following frames of a given video and see the text prompt for generating the video, \\nthen give scores from 7 different dimensions:\\n(1) visual quality, \\n(2) object consistency,\\n(3) dynamic degree,\\n(4) motion smoothness,\\n(5) text-to-video alignment,\\n(6) factual consistency, \\n(7) overall score\\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\\nHere is an output example: \\nvisual quality: 3\\nobject consistency: 2 \\ndynamic degree: 2\\nmotion smoothness: 1\\ntext-to-video alignment: 1\\nfactual consistency: 2\\noverall score: 1\\n\\nFor this item, the text prompt is a man wearing a striped blanket over his backpack runs behind two other men.,\\nall the frames of video are as follows: \\n\\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \\n\"},\n",
+       "   {'from': 'gpt',\n",
+       "    'value': 'visual quality: 3\\n object consistency: 3\\n dynamic degree: 3\\n motion smoothness: 3\\n text-to-video alignment: 3\\n factual consistency: 3\\n overall score: 3\\n'}]},\n",
+       " {'id': 'worsen_gen_b404675',\n",
+       "  'images': ['b404675_00.jpg',\n",
+       "   'b404675_01.jpg',\n",
+       "   'b404675_02.jpg',\n",
+       "   'b404675_03.jpg',\n",
+       "   'b404675_04.jpg',\n",
+       "   'b404675_05.jpg',\n",
+       "   'b404675_06.jpg',\n",
+       "   'b404675_07.jpg',\n",
+       "   'b404675_08.jpg',\n",
+       "   'b404675_09.jpg',\n",
+       "   'b404675_10.jpg',\n",
+       "   'b404675_11.jpg',\n",
+       "   'b404675_12.jpg',\n",
+       "   'b404675_13.jpg',\n",
+       "   'b404675_14.jpg',\n",
+       "   'b404675_15.jpg'],\n",
+       "  'conversations': [{'from': 'human',\n",
+       "    'value': \"\\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \\nplease watch the following frames of a given video and see the text prompt for generating the video, \\nthen give scores from 7 different dimensions:\\n(1) visual quality, \\n(2) object consistency,\\n(3) dynamic degree,\\n(4) motion smoothness,\\n(5) text-to-video alignment,\\n(6) factual consistency, \\n(7) overall score\\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\\nHere is an output example: \\nvisual quality: 3\\nobject consistency: 2 \\ndynamic degree: 2\\nmotion smoothness: 1\\ntext-to-video alignment: 1\\nfactual consistency: 2\\noverall score: 1\\n\\nFor this item, the text prompt is After a few years, Ghulam Muhammad became the owner of his business  ,\\nall the frames of video are as follows: \\n\\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \\n\"},\n",
+       "   {'from': 'gpt',\n",
+       "    'value': 'visual quality: 1\\n object consistency: 1\\n dynamic degree: 3\\n motion smoothness: 3\\n text-to-video alignment: 2\\n factual consistency: 3\\n overall score: 1\\n'}]},\n",
+       " {'id': 'worsen_gen_b402727',\n",
+       "  'images': ['b402727_00.jpg',\n",
+       "   'b402727_01.jpg',\n",
+       "   'b402727_02.jpg',\n",
+       "   'b402727_03.jpg',\n",
+       "   'b402727_04.jpg',\n",
+       "   'b402727_05.jpg',\n",
+       "   'b402727_06.jpg',\n",
+       "   'b402727_07.jpg',\n",
+       "   'b402727_08.jpg',\n",
+       "   'b402727_09.jpg',\n",
+       "   'b402727_10.jpg',\n",
+       "   'b402727_11.jpg',\n",
+       "   'b402727_12.jpg',\n",
+       "   'b402727_13.jpg',\n",
+       "   'b402727_14.jpg',\n",
+       "   'b402727_15.jpg'],\n",
+       "  'conversations': [{'from': 'human',\n",
+       "    'value': \"\\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \\nplease watch the following frames of a given video and see the text prompt for generating the video, \\nthen give scores from 7 different dimensions:\\n(1) visual quality, \\n(2) object consistency,\\n(3) dynamic degree,\\n(4) motion smoothness,\\n(5) text-to-video alignment,\\n(6) factual consistency, \\n(7) overall score\\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\\nHere is an output example: \\nvisual quality: 3\\nobject consistency: 2 \\ndynamic degree: 2\\nmotion smoothness: 1\\ntext-to-video alignment: 1\\nfactual consistency: 2\\noverall score: 1\\n\\nFor this item, the text prompt is view down to the path in the field, moving forward, night, dark.  ,\\nall the frames of video are as follows: \\n\\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \\n\"},\n",
+       "   {'from': 'gpt',\n",
+       "    'value': 'visual quality: 1\\n object consistency: 1\\n dynamic degree: 3\\n motion smoothness: 3\\n text-to-video alignment: 3\\n factual consistency: 3\\n overall score: 1\\n'}]},\n",
+       " {'id': 'worsen_gen_b304986',\n",
+       "  'images': ['b304986_00.jpg',\n",
+       "   'b304986_01.jpg',\n",
+       "   'b304986_02.jpg',\n",
+       "   'b304986_03.jpg',\n",
+       "   'b304986_04.jpg',\n",
+       "   'b304986_05.jpg',\n",
+       "   'b304986_06.jpg',\n",
+       "   'b304986_07.jpg',\n",
+       "   'b304986_08.jpg',\n",
+       "   'b304986_09.jpg',\n",
+       "   'b304986_10.jpg',\n",
+       "   'b304986_11.jpg',\n",
+       "   'b304986_12.jpg',\n",
+       "   'b304986_13.jpg',\n",
+       "   'b304986_14.jpg',\n",
+       "   'b304986_15.jpg'],\n",
+       "  'conversations': [{'from': 'human',\n",
+       "    'value': \"\\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \\nplease watch the following frames of a given video and see the text prompt for generating the video, \\nthen give scores from 7 different dimensions:\\n(1) visual quality, \\n(2) object consistency,\\n(3) dynamic degree,\\n(4) motion smoothness,\\n(5) text-to-video alignment,\\n(6) factual consistency, \\n(7) overall score\\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\\nHere is an output example: \\nvisual quality: 3\\nobject consistency: 2 \\ndynamic degree: 2\\nmotion smoothness: 1\\ntext-to-video alignment: 1\\nfactual consistency: 2\\noverall score: 1\\n\\nFor this item, the text prompt is Old Philippino man, close up, subtle smile, teats of joy, in old wood room  ,\\nall the frames of video are as follows: \\n\\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \\n\"},\n",
+       "   {'from': 'gpt',\n",
+       "    'value': 'visual quality: 3\\n object consistency: 1\\n dynamic degree: 3\\n motion smoothness: 1\\n text-to-video alignment: 2\\n factual consistency: 3\\n overall score: 1\\n'}]},\n",
+       " {'id': 'static_d500937',\n",
+       "  'images': ['d500937_00.jpg',\n",
+       "   'd500937_01.jpg',\n",
+       "   'd500937_02.jpg',\n",
+       "   'd500937_03.jpg',\n",
+       "   'd500937_04.jpg',\n",
+       "   'd500937_05.jpg',\n",
+       "   'd500937_06.jpg',\n",
+       "   'd500937_07.jpg',\n",
+       "   'd500937_08.jpg',\n",
+       "   'd500937_09.jpg',\n",
+       "   'd500937_10.jpg',\n",
+       "   'd500937_11.jpg',\n",
+       "   'd500937_12.jpg',\n",
+       "   'd500937_13.jpg',\n",
+       "   'd500937_14.jpg',\n",
+       "   'd500937_15.jpg'],\n",
+       "  'conversations': [{'from': 'human',\n",
+       "    'value': \"\\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \\nplease watch the following frames of a given video and see the text prompt for generating the video, \\nthen give scores from 7 different dimensions:\\n(1) visual quality, \\n(2) object consistency,\\n(3) dynamic degree,\\n(4) motion smoothness,\\n(5) text-to-video alignment,\\n(6) factual consistency, \\n(7) overall score\\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\\nHere is an output example: \\nvisual quality: 3\\nobject consistency: 2 \\ndynamic degree: 2\\nmotion smoothness: 1\\ntext-to-video alignment: 1\\nfactual consistency: 2\\noverall score: 1\\n\\nFor this item, the text prompt is write y in the garden with flowers  ,\\nall the frames of video are as follows: \\n\\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \\n\"},\n",
+       "   {'from': 'gpt',\n",
+       "    'value': 'visual quality: 1\\n object consistency: 3\\n dynamic degree: 1\\n motion smoothness: 1\\n text-to-video alignment: 1\\n factual consistency: 1\\n overall score: 1\\n'}]},\n",
+       " {'id': 'static_d500506',\n",
+       "  'images': ['d500506_00.jpg',\n",
+       "   'd500506_01.jpg',\n",
+       "   'd500506_02.jpg',\n",
+       "   'd500506_03.jpg',\n",
+       "   'd500506_04.jpg',\n",
+       "   'd500506_05.jpg',\n",
+       "   'd500506_06.jpg',\n",
+       "   'd500506_07.jpg',\n",
+       "   'd500506_08.jpg',\n",
+       "   'd500506_09.jpg',\n",
+       "   'd500506_10.jpg',\n",
+       "   'd500506_11.jpg',\n",
+       "   'd500506_12.jpg',\n",
+       "   'd500506_13.jpg',\n",
+       "   'd500506_14.jpg',\n",
+       "   'd500506_15.jpg'],\n",
+       "  'conversations': [{'from': 'human',\n",
+       "    'value': \"\\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \\nplease watch the following frames of a given video and see the text prompt for generating the video, \\nthen give scores from 7 different dimensions:\\n(1) visual quality, \\n(2) object consistency,\\n(3) dynamic degree,\\n(4) motion smoothness,\\n(5) text-to-video alignment,\\n(6) factual consistency, \\n(7) overall score\\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\\nHere is an output example: \\nvisual quality: 3\\nobject consistency: 2 \\ndynamic degree: 2\\nmotion smoothness: 1\\ntext-to-video alignment: 1\\nfactual consistency: 2\\noverall score: 1\\n\\nFor this item, the text prompt is a boy paiting a beautiful landscape  ,\\nall the frames of video are as follows: \\n\\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \\n\"},\n",
+       "   {'from': 'gpt',\n",
+       "    'value': 'visual quality: 1\\n object consistency: 3\\n dynamic degree: 1\\n motion smoothness: 1\\n text-to-video alignment: 1\\n factual consistency: 1\\n overall score: 1\\n'}]},\n",
+       " {'id': 'static_d401950',\n",
+       "  'images': ['d401950_00.jpg',\n",
+       "   'd401950_01.jpg',\n",
+       "   'd401950_02.jpg',\n",
+       "   'd401950_03.jpg',\n",
+       "   'd401950_04.jpg',\n",
+       "   'd401950_05.jpg',\n",
+       "   'd401950_06.jpg',\n",
+       "   'd401950_07.jpg',\n",
+       "   'd401950_08.jpg',\n",
+       "   'd401950_09.jpg',\n",
+       "   'd401950_10.jpg',\n",
+       "   'd401950_11.jpg',\n",
+       "   'd401950_12.jpg',\n",
+       "   'd401950_13.jpg',\n",
+       "   'd401950_14.jpg',\n",
+       "   'd401950_15.jpg'],\n",
+       "  'conversations': [{'from': 'human',\n",
+       "    'value': \"\\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \\nplease watch the following frames of a given video and see the text prompt for generating the video, \\nthen give scores from 7 different dimensions:\\n(1) visual quality, \\n(2) object consistency,\\n(3) dynamic degree,\\n(4) motion smoothness,\\n(5) text-to-video alignment,\\n(6) factual consistency, \\n(7) overall score\\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\\nHere is an output example: \\nvisual quality: 3\\nobject consistency: 2 \\ndynamic degree: 2\\nmotion smoothness: 1\\ntext-to-video alignment: 1\\nfactual consistency: 2\\noverall score: 1\\n\\nFor this item, the text prompt is sneaker with exclusive design worth 1 billion  ,\\nall the frames of video are as follows: \\n\\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \\n\"},\n",
+       "   {'from': 'gpt',\n",
+       "    'value': 'visual quality: 1\\n object consistency: 3\\n dynamic degree: 1\\n motion smoothness: 1\\n text-to-video alignment: 1\\n factual consistency: 1\\n overall score: 1\\n'}]},\n",
+       " {'id': 'insf_a500251',\n",
+       "  'images': ['a500251_00.jpg',\n",
+       "   'a500251_01.jpg',\n",
+       "   'a500251_02.jpg',\n",
+       "   'a500251_03.jpg',\n",
+       "   'a500251_04.jpg',\n",
+       "   'a500251_05.jpg',\n",
+       "   'a500251_06.jpg',\n",
+       "   'a500251_07.jpg',\n",
+       "   'a500251_08.jpg',\n",
+       "   'a500251_09.jpg',\n",
+       "   'a500251_10.jpg',\n",
+       "   'a500251_11.jpg',\n",
+       "   'a500251_12.jpg',\n",
+       "   'a500251_13.jpg',\n",
+       "   'a500251_14.jpg',\n",
+       "   'a500251_15.jpg'],\n",
+       "  'conversations': [{'from': 'human',\n",
+       "    'value': \"\\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \\nplease watch the following frames of a given video and see the text prompt for generating the video, \\nthen give scores from 7 different dimensions:\\n(1) visual quality, \\n(2) object consistency,\\n(3) dynamic degree,\\n(4) motion smoothness,\\n(5) text-to-video alignment,\\n(6) factual consistency, \\n(7) overall score\\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\\nHere is an output example: \\nvisual quality: 3\\nobject consistency: 2 \\ndynamic degree: 2\\nmotion smoothness: 1\\ntext-to-video alignment: 1\\nfactual consistency: 2\\noverall score: 1\\n\\nFor this item, the text prompt is :sci fi war in the moon  ,\\nall the frames of video are as follows: \\n\\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \\n\"},\n",
+       "   {'from': 'gpt',\n",
+       "    'value': 'visual quality: 1\\n object consistency: 1\\n dynamic degree: 3\\n motion smoothness: 1\\n text-to-video alignment: 1\\n factual consistency: 1\\n overall score: 1\\n'}]},\n",
+       " {'id': 'insf_a400480',\n",
+       "  'images': ['a400480_00.jpg',\n",
+       "   'a400480_01.jpg',\n",
+       "   'a400480_02.jpg',\n",
+       "   'a400480_03.jpg',\n",
+       "   'a400480_04.jpg',\n",
+       "   'a400480_05.jpg',\n",
+       "   'a400480_06.jpg',\n",
+       "   'a400480_07.jpg',\n",
+       "   'a400480_08.jpg',\n",
+       "   'a400480_09.jpg',\n",
+       "   'a400480_10.jpg',\n",
+       "   'a400480_11.jpg',\n",
+       "   'a400480_12.jpg',\n",
+       "   'a400480_13.jpg',\n",
+       "   'a400480_14.jpg',\n",
+       "   'a400480_15.jpg'],\n",
+       "  'conversations': [{'from': 'human',\n",
+       "    'value': \"\\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \\nplease watch the following frames of a given video and see the text prompt for generating the video, \\nthen give scores from 7 different dimensions:\\n(1) visual quality, \\n(2) object consistency,\\n(3) dynamic degree,\\n(4) motion smoothness,\\n(5) text-to-video alignment,\\n(6) factual consistency, \\n(7) overall score\\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\\nHere is an output example: \\nvisual quality: 3\\nobject consistency: 2 \\ndynamic degree: 2\\nmotion smoothness: 1\\ntext-to-video alignment: 1\\nfactual consistency: 2\\noverall score: 1\\n\\nFor this item, the text prompt is for a dog accessories brand a welcome image: brand name: hot doggies  ,\\nall the frames of video are as follows: \\n\\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \\n\"},\n",
+       "   {'from': 'gpt',\n",
+       "    'value': 'visual quality: 1\\n object consistency: 1\\n dynamic degree: 3\\n motion smoothness: 1\\n text-to-video alignment: 1\\n factual consistency: 1\\n overall score: 1\\n'}]},\n",
+       " {'id': 'insf_a500010',\n",
+       "  'images': ['a500010_00.jpg',\n",
+       "   'a500010_01.jpg',\n",
+       "   'a500010_02.jpg',\n",
+       "   'a500010_03.jpg',\n",
+       "   'a500010_04.jpg',\n",
+       "   'a500010_05.jpg',\n",
+       "   'a500010_06.jpg',\n",
+       "   'a500010_07.jpg',\n",
+       "   'a500010_08.jpg',\n",
+       "   'a500010_09.jpg',\n",
+       "   'a500010_10.jpg',\n",
+       "   'a500010_11.jpg',\n",
+       "   'a500010_12.jpg',\n",
+       "   'a500010_13.jpg',\n",
+       "   'a500010_14.jpg',\n",
+       "   'a500010_15.jpg'],\n",
+       "  'conversations': [{'from': 'human',\n",
+       "    'value': \"\\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \\nplease watch the following frames of a given video and see the text prompt for generating the video, \\nthen give scores from 7 different dimensions:\\n(1) visual quality, \\n(2) object consistency,\\n(3) dynamic degree,\\n(4) motion smoothness,\\n(5) text-to-video alignment,\\n(6) factual consistency, \\n(7) overall score\\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\\nHere is an output example: \\nvisual quality: 3\\nobject consistency: 2 \\ndynamic degree: 2\\nmotion smoothness: 1\\ntext-to-video alignment: 1\\nfactual consistency: 2\\noverall score: 1\\n\\nFor this item, the text prompt is a druid in the forest cinematic  ,\\nall the frames of video are as follows: \\n\\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \\n\"},\n",
+       "   {'from': 'gpt',\n",
+       "    'value': 'visual quality: 1\\n object consistency: 1\\n dynamic degree: 3\\n motion smoothness: 1\\n text-to-video alignment: 1\\n factual consistency: 1\\n overall score: 1\\n'}]},\n",
+       " {'id': 'lab_3005033',\n",
+       "  'images': ['3005033_00.jpg',\n",
+       "   '3005033_01.jpg',\n",
+       "   '3005033_02.jpg',\n",
+       "   '3005033_03.jpg',\n",
+       "   '3005033_04.jpg',\n",
+       "   '3005033_05.jpg',\n",
+       "   '3005033_06.jpg',\n",
+       "   '3005033_07.jpg',\n",
+       "   '3005033_08.jpg',\n",
+       "   '3005033_09.jpg',\n",
+       "   '3005033_10.jpg',\n",
+       "   '3005033_11.jpg',\n",
+       "   '3005033_12.jpg',\n",
+       "   '3005033_13.jpg',\n",
+       "   '3005033_14.jpg',\n",
+       "   '3005033_15.jpg'],\n",
+       "  'conversations': [{'from': 'human',\n",
+       "    'value': \"\\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \\nplease watch the following frames of a given video and see the text prompt for generating the video, \\nthen give scores from 7 different dimensions:\\n(1) visual quality, \\n(2) object consistency,\\n(3) dynamic degree,\\n(4) motion smoothness,\\n(5) text-to-video alignment,\\n(6) factual consistency, \\n(7) overall score\\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\\nHere is an output example: \\nvisual quality: 3\\nobject consistency: 2 \\ndynamic degree: 2\\nmotion smoothness: 1\\ntext-to-video alignment: 1\\nfactual consistency: 2\\noverall score: 1\\n\\nFor this item, the text prompt is a robot vacuum traveling through southeast asian countries  ,\\nall the frames of video are as follows: \\n\\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \\n\"},\n",
+       "   {'from': 'gpt',\n",
+       "    'value': 'visual quality: 3\\n object consistency: 2\\n dynamic degree: 3\\n motion smoothness: 2\\n text-to-video alignment: 2\\n factual consistency: 3\\n overall score: 2\\n'}]},\n",
+       " {'id': 'lab_7004180',\n",
+       "  'images': ['7004180_00.jpg',\n",
+       "   '7004180_01.jpg',\n",
+       "   '7004180_02.jpg',\n",
+       "   '7004180_03.jpg',\n",
+       "   '7004180_04.jpg',\n",
+       "   '7004180_05.jpg',\n",
+       "   '7004180_06.jpg',\n",
+       "   '7004180_07.jpg'],\n",
+       "  'conversations': [{'from': 'human',\n",
+       "    'value': \"\\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \\nplease watch the following frames of a given video and see the text prompt for generating the video, \\nthen give scores from 7 different dimensions:\\n(1) visual quality, \\n(2) object consistency,\\n(3) dynamic degree,\\n(4) motion smoothness,\\n(5) text-to-video alignment,\\n(6) factual consistency, \\n(7) overall score\\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\\nHere is an output example: \\nvisual quality: 3\\nobject consistency: 2 \\ndynamic degree: 2\\nmotion smoothness: 1\\ntext-to-video alignment: 1\\nfactual consistency: 2\\noverall score: 1\\n\\nFor this item, the text prompt is jiaxuan is the most beautiful ，elegant，charming girl in the world. billions of boys fell in love with her  ,\\nall the frames of video are as follows: \\n\\n<image> <image> <image> <image> <image> <image> <image> <image> \\n\"},\n",
+       "   {'from': 'gpt',\n",
+       "    'value': 'visual quality: 3\\n object consistency: 3\\n dynamic degree: 3\\n motion smoothness: 3\\n text-to-video alignment: 3\\n factual consistency: 3\\n overall score: 3\\n'}]},\n",
+       " {'id': 'lab_1006309',\n",
+       "  'images': ['1006309_00.jpg',\n",
+       "   '1006309_01.jpg',\n",
+       "   '1006309_02.jpg',\n",
+       "   '1006309_03.jpg',\n",
+       "   '1006309_04.jpg',\n",
+       "   '1006309_05.jpg',\n",
+       "   '1006309_06.jpg',\n",
+       "   '1006309_07.jpg',\n",
+       "   '1006309_08.jpg',\n",
+       "   '1006309_09.jpg',\n",
+       "   '1006309_10.jpg',\n",
+       "   '1006309_11.jpg',\n",
+       "   '1006309_12.jpg',\n",
+       "   '1006309_13.jpg',\n",
+       "   '1006309_14.jpg',\n",
+       "   '1006309_15.jpg'],\n",
+       "  'conversations': [{'from': 'human',\n",
+       "    'value': \"\\nSuppose you are an expert in judging and evaluating the quality of AI-generated videos, \\nplease watch the following frames of a given video and see the text prompt for generating the video, \\nthen give scores from 7 different dimensions:\\n(1) visual quality, \\n(2) object consistency,\\n(3) dynamic degree,\\n(4) motion smoothness,\\n(5) text-to-video alignment,\\n(6) factual consistency, \\n(7) overall score\\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\\nHere is an output example: \\nvisual quality: 3\\nobject consistency: 2 \\ndynamic degree: 2\\nmotion smoothness: 1\\ntext-to-video alignment: 1\\nfactual consistency: 2\\noverall score: 1\\n\\nFor this item, the text prompt is scary halloween skeletons are having a conversation with each other  ,\\nall the frames of video are as follows: \\n\\n<image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> <image> \\n\"},\n",
+       "   {'from': 'gpt',\n",
+       "    'value': 'visual quality: 3\\n object consistency: 2\\n dynamic degree: 3\\n motion smoothness: 2\\n text-to-video alignment: 3\\n factual consistency: 2\\n overall score: 3\\n'}]}]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "examples"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "e"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mantis",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/requirements.txt b/requirements.txt
index 7a8160f8644eae148fb2e06a3ba696373c1f32d3..c07df4053288aa449b7898886832a172dd9a9dfe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,5 @@ Pillow
 gradio
 spaces
 multiprocess
-peft
\ No newline at end of file
+peft
+av
\ No newline at end of file