DongfuJiang committed on
Commit
2827015
1 Parent(s): b8938cc
Files changed (3)
  1. README.md +1 -1
  2. app_generation.py +197 -0
  3. app_high_res.py +16 -18
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: green
  colorTo: yellow
  sdk: gradio
  sdk_version: 4.24.0
- app_file: app_regression.py
+ app_file: app_generation.py
  pinned: false
  license: apache-2.0
  short_description: Multimodal Language Model
app_generation.py ADDED
@@ -0,0 +1,197 @@
+ import gradio as gr
+ import spaces
+ import os
+ import time
+ import json
+ import numpy as np
+ import av
+ import torch
+ from PIL import Image
+ import functools
+ from transformers import AutoProcessor, AutoConfig
+ from models.idefics2 import Idefics2ForSequenceClassification, Idefics2ForConditionalGeneration
+ from models.conversation import conv_templates
+ from typing import List
+
+ processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-refined-40k_4096_generation")
+ model = Idefics2ForConditionalGeneration.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-refined-40k_4096_generation", torch_dtype=torch.bfloat16).eval()
+
+ MAX_NUM_FRAMES = 24
+ conv_template = conv_templates["idefics_2"]
+
+ with open("./examples/all_subsets.json", 'r') as f:
+     examples = json.load(f)
+
+ for item in examples:
+     video_id = item['images'][0].split("_")[0]
+     item['images'] = [os.path.join("./examples", video_id, x) for x in item['images']]
+     item['video'] = os.path.join("./examples", item['video'])
+
+ with open("./examples/hd.json", 'r') as f:
+     hd_examples = json.load(f)
+
+ for item in hd_examples:
+     item['video'] = os.path.join("./examples", item['video'])
+
+ examples = hd_examples + examples
+
+ VIDEO_EVAL_PROMPT = """
+ Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
+ please watch the following frames of a given video and see the text prompt for generating the video,
+ then give scores from 5 different dimensions:
+ (1) visual quality: the quality of the video in terms of clearness, resolution, brightness, and color
+ (2) temporal consistency, the consistency of objects or humans in video
+ (3) dynamic degree, the degree of dynamic changes
+ (4) text-to-video alignment, the alignment between the text prompt and the video content
+ (5) factual consistency, the consistency of the video content with the common-sense and factual knowledge
+
+ For each dimension, output a number from [1,2,3,4],
+ in which '1' means 'Bad', '2' means 'Average', '3' means 'Good',
+ '4' means 'Real' or 'Perfect' (the video is like a real video)
+ Here is an output example:
+ visual quality: 4
+ temporal consistency: 4
+ dynamic degree: 3
+ text-to-video alignment: 1
+ factual consistency: 2
+
+ For this video, the text prompt is "{text_prompt}",
+ all the frames of video are as follows:
+
+ """
+
+
+ aspect_mapping= [
+     "visual quality",
+     "temporal consistency",
+     "dynamic degree",
+     "text-to-video alignment",
+     "factual consistency",
+ ]
+
+
+ @spaces.GPU(duration=60)
+ def score(prompt:str, images:List[Image.Image]):
+     if not prompt:
+         raise gr.Error("Please provide a prompt")
+     model.to("cuda")
+     if not images:
+         images = None
+
+     flatten_images = []
+     for x in images:
+         if isinstance(x, list):
+             flatten_images.extend(x)
+         else:
+             flatten_images.append(x)
+
+     messages = [{"role": "User", "content": [{"type": "text", "text": prompt}]}]
+     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+     print(prompt)
+
+     flatten_images = [Image.open(x) if isinstance(x, str) else x for x in flatten_images]
+     inputs = processor(text=prompt, images=flatten_images, return_tensors="pt")
+     inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+     outputs = model.generate(**inputs, max_new_tokens=1024)
+     generated_text = processor.decode(outputs[0, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
+     return generated_text
+
+ def read_video_pyav(container, indices):
+     '''
+     Decode the video with PyAV decoder.
+
+     Args:
+         container (av.container.input.InputContainer): PyAV container.
+         indices (List[int]): List of frame indices to decode.
+
+     Returns:
+         np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
+     '''
+     frames = []
+     container.seek(0)
+     start_index = indices[0]
+     end_index = indices[-1]
+     for i, frame in enumerate(container.decode(video=0)):
+         if i > end_index:
+             break
+         if i >= start_index and i in indices:
+             frames.append(frame)
+     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+ def eval_video(prompt, video:str):
+     container = av.open(video)
+
+     # sample uniformly 8 frames from the video
+     total_frames = container.streams.video[0].frames
+     if total_frames > MAX_NUM_FRAMES:
+         indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
+     else:
+         indices = np.arange(total_frames)
+     video_frames = read_video_pyav(container, indices)
+
+     frames = [Image.fromarray(x) for x in video_frames]
+
+     eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=prompt)
+
+     num_image_token = eval_prompt.count("<image>")
+     if num_image_token < len(frames):
+         eval_prompt += "<image> " * (len(frames) - num_image_token)
+
+     aspect_scores = score(eval_prompt, [frames])
+     return aspect_scores
+
+ def build_demo():
+     with gr.Blocks() as demo:
+         gr.Markdown("""
+         ## Video Evaluation
+         Upload a video along with the text prompt used to generate it; this model will evaluate the video's quality along 5 different dimensions.
+         """)
+         with gr.Row():
+             video = gr.Video(width=500, label="Video")
+             with gr.Column():
+                 eval_prompt_template = gr.Textbox(VIDEO_EVAL_PROMPT.strip(' \n'), label="Evaluation Prompt Template", interactive=False, max_lines=26)
+                 video_prompt = gr.Textbox(label="Text Prompt", lines=1)
+                 with gr.Row():
+                     eval_button = gr.Button("Evaluate Video")
+                     clear_button = gr.ClearButton([video, video_prompt])
+                 eval_result = gr.Textbox(label="Evaluation result", interactive=False, lines=7)
+                 # eval_result = gr.Json(label="Evaluation result")
+
+
+         eval_button.click(
+             eval_video, [video_prompt, video], [eval_result]
+         )
+
+         dummy_id = gr.Textbox("id", label="id", visible=False, min_width=50)
+         dummy_output = gr.Textbox("reference score", label="reference scores", visible=False, lines=7)
+
+         gr.Examples(
+             examples=
+             [
+                 [
+                     item['id'],
+                     item['prompt'],
+                     item['video'],
+                     item['conversations'][1]['value']
+                 ] for item in examples
+             ],
+             inputs=[dummy_id, video_prompt, video, dummy_output],
+         )
+
+         # gr.Markdown("""
+         # ## Citation
+         # ```
+         # @article{jiang2024mantis,
+         #   title={MANTIS: Interleaved Multi-Image Instruction Tuning},
+         #   author={Jiang, Dongfu and He, Xuan and Zeng, Huaye and Wei, Con and Ku, Max and Liu, Qian and Chen, Wenhu},
+         #   journal={arXiv preprint arXiv:2405.01483},
+         #   year={2024}
+         # }
+         # ```""")
+     return demo
+
+
+ if __name__ == "__main__":
+     demo = build_demo()
+     demo.launch(share=True)
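
Note: `score` returns the model's raw generated text, which follows the `aspect: score` layout shown in `VIDEO_EVAL_PROMPT`. A minimal sketch of turning that text into per-dimension numbers; the `parse_aspect_scores` helper and the sample output string are illustrative only and not part of this commit:

```python
import re
from typing import Dict, List

def parse_aspect_scores(generated_text: str, aspects: List[str]) -> Dict[str, int]:
    """Extract '<aspect>: <1-4>' pairs from the model's generated text."""
    scores = {}
    for aspect in aspects:
        # Look for e.g. "temporal consistency: 3" anywhere in the output, case-insensitively.
        match = re.search(rf"{re.escape(aspect)}\s*:\s*([1-4])", generated_text, re.IGNORECASE)
        if match:
            scores[aspect] = int(match.group(1))
    return scores

# Same aspect list as aspect_mapping in app_generation.py
aspects = ["visual quality", "temporal consistency", "dynamic degree",
           "text-to-video alignment", "factual consistency"]
sample_output = "visual quality: 4\ntemporal consistency: 4\ndynamic degree: 3\ntext-to-video alignment: 1\nfactual consistency: 2"
print(parse_aspect_scores(sample_output, aspects))
# {'visual quality': 4, 'temporal consistency': 4, 'dynamic degree': 3, 'text-to-video alignment': 1, 'factual consistency': 2}
```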
app_high_res.py CHANGED
@@ -13,8 +13,8 @@ from models.conversation import conv_templates
  from typing import List


- processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-high-res-40k-mantis-2epoch_4096")
- model = Idefics2ForConditionalGeneration.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-high-res-40k-mantis-2epoch_4096", torch_dtype=torch.bfloat16)
+ processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-refined-40k_4096_generation")
+ model = Idefics2ForConditionalGeneration.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-refined-40k_4096_generation", device_map="auto", torch_dtype=torch.bfloat16).eval()
  MAX_NUM_FRAMES = 24
  conv_template = conv_templates["idefics_2"]

@@ -35,29 +35,27 @@ for item in hd_examples:
  examples = hd_examples + examples

  VIDEO_EVAL_PROMPT = """
- Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
- please watch the following frames of a given video and see the text prompt for generating the video,
- then give scores from 7 different dimensions:
+ Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
+ please watch the following frames of a given video and see the text prompt for generating the video,
+ then give scores from 5 different dimensions:
  (1) visual quality: the quality of the video in terms of clearness, resolution, brightness, and color
- (2) object consistency, the consistency of objects or humans in video
+ (2) temporal consistency, the consistency of objects or humans in video
  (3) dynamic degree, the degree of dynamic changes
- (4) motion smoothness, the smoothness of motion or movements
- (5) text-to-video alignment, the alignment between the text prompt and the video content
- (6) factual consistency, the consistency of the video content with the common-sense and factual knowledge
- (7) overall score, the overall quality of the video
- for each dimension, output a number from [1,2,3,4],
- in which '1' is 'Bad', '2' is 'Average', '3' is 'Good', '4' is 'Perfect'
+ (4) text-to-video alignment, the alignment between the text prompt and the video content
+ (5) factual consistency, the consistency of the video content with the common-sense and factual knowledge
+
+ For each dimension, output a number from [1,2,3,4],
+ in which '1' means 'Bad', '2' means 'Average', '3' means 'Good',
+ '4' means 'Real' or 'Perfect' (the video is like a real video)
  Here is an output example:
- visual quality: 3
- object consistency: 4
- dynamic degree: 4
- motion smoothness: 1
+ visual quality: 4
+ temporal consistency: 4
+ dynamic degree: 3
  text-to-video alignment: 1
  factual consistency: 2
- overall score: 1

  For this video, the text prompt is "{text_prompt}",
- all the frames of video are as follows:
+ all the frames of video are as follows:

  """
  @spaces.GPU(duration=60)
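
Note: both apps cap evaluation at MAX_NUM_FRAMES = 24 frames. A self-contained sketch of the uniform frame-index computation used by `eval_video` in app_generation.py (app_high_res.py appears to share the same sampling logic, though it is outside these hunks); the frame count below is an assumed value for illustration, and the resulting indices are what `read_video_pyav` decodes:

```python
import numpy as np

# Sketch of the uniform frame sampling in eval_video: when a video has more than
# MAX_NUM_FRAMES frames, take evenly spaced indices; otherwise keep every frame.
MAX_NUM_FRAMES = 24
total_frames = 96  # hypothetical frame count, for illustration only

if total_frames > MAX_NUM_FRAMES:
    indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
else:
    indices = np.arange(total_frames)

print(len(indices), indices[:6].tolist())  # 24 [0, 4, 8, 12, 16, 20]
```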