hexuan21 committed
Commit 120ff1b · Parent: 6811eb1

add gradio app

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,126 @@
+import gradio as gr
+import spaces
+import torch
+import json
+import os
+from string import Template
+from eval_methods.vs2_float import eval_VideoScore2_float
+
+# ----------------------------
+# Constants and Model Init
+# ----------------------------
+MODEL_NAME = "TIGER-Lab/VideoScore2"
+
+vs2_evaluator = eval_VideoScore2_float(MODEL_NAME)
+
+VS2_QUERY_TEMPLATE = Template("""
+You are an expert at evaluating and reasoning about the quality of AI-generated videos across diverse dimensions.
+
+We would like to evaluate the video's quality from three dimensions: 'visual quality', 'text-to-video alignment' and 'physical/common-sense consistency'. Below is the definition of each dimension:
+(1) visual quality:
+The dimension 'visual quality' concerns the video's visual and optical properties, including resolution, overall clarity, local blurriness, smoothness, stability of brightness/contrast, distortion/misalignment, abrupt changes, and any other factors that affect the watching experience.
+(2) text-to-video alignment:
+The dimension 't2v_alignment' mainly assesses whether the generated video fully and accurately depicts the elements mentioned in the text prompt, such as characters, actions, animals, etc., as well as background, quantity, color, weather, and so on.
+(3) physical/common-sense consistency:
+The dimension 'physical/common-sense consistency' mainly examines whether the video contains violations of common sense or physical laws, or anything else that appears strange or unnatural.
+
+Here we provide an AI video generated by text-to-video models and its text prompt:
+$t2v_prompt.
+
+Based on the video content and the dimension definitions, please evaluate the video and give the quality scores.
+Each quality score must be an integer in the range 1 - 5.
+
+Your output must be in the following format:
+visual quality: <v_score>;
+text-to-video alignment: <t_score>;
+physical/common-sense consistency: <p_score>
+
+DO NOT include anything else before or after your output.
+""")
+
+space_description = """
+[📃Paper](https://www.arxiv.org/abs/2509.22799) | [🌐Website](https://tiger-ai-lab.github.io/VideoScore2/) | [💻GitHub](https://github.com/TIGER-AI-Lab/VideoScore2) | [🛢️Dataset](https://huggingface.co/datasets/TIGER-Lab/VideoFeedback2) | [🤗Model](https://huggingface.co/TIGER-Lab/VideoScore2)
+
+**VideoScore2** is a next-generation, interpretable and multi-dimensional video evaluation model designed to align with human judgment on text-to-video generation tasks.
+It explicitly evaluates **visual quality**, **text-to-video alignment**, and **physical/common-sense consistency**, producing structured scores and reasoning.
+"""
+
+with open("./examples/examples.json", "r") as f:
+    examples = json.load(f)
+
+# ----------------------------
+# Evaluation Core
+# ----------------------------
+@spaces.GPU(duration=60)  # request a ZeroGPU slot for up to 60s per call
+def eval_vs2(video_path, t2v_prompt):
+    if not video_path:
+        raise gr.Error("Please upload a video.")
+    if not t2v_prompt:
+        raise gr.Error("Please provide a text prompt.")
+
+    user_prompt = VS2_QUERY_TEMPLATE.substitute(t2v_prompt=t2v_prompt)
+    method_kwargs = {
+        "max_tokens": 1024,
+        "infer_fps": 2.0,
+    }
+
+    with torch.no_grad():
+        v_score, t_score, p_score, full_text = vs2_evaluator.evaluate_video(
+            user_prompt=user_prompt,
+            video_path=video_path,
+            kwargs=method_kwargs,
+        )
+
+    return {
+        "visual quality": v_score,
+        "text-to-video alignment": t_score,
+        "physical/common-sense consistency": p_score,
+        "full analysis": full_text,
+    }
+
+# ----------------------------
+# Build Gradio Demo
+# ----------------------------
+def build_demo():
+    with gr.Blocks() as demo:
+        gr.Markdown("## VideoScore2: Think before You Score in Generative Video Evaluation")
+        gr.Markdown(space_description)
+        gr.Image("https://tiger-ai-lab.github.io/VideoScore2/static/images/teaser.png", label="Teaser")
+
+        with gr.Row():
+            video_input = gr.Video(label="Upload your video", width=500)
+            with gr.Column():
+                t2v_prompt = gr.Textbox(label="Text-to-Video Prompt", placeholder="Describe the video prompt...")
+                eval_btn = gr.Button("Evaluate Video", variant="primary")
+                result_box = gr.JSON(label="Evaluation Result")
+
+        eval_btn.click(fn=eval_vs2, inputs=[video_input, t2v_prompt], outputs=[result_box])
+
+        gr.Examples(
+            examples=[
+                # examples.json stores bare filenames; the videos live under ./examples.
+                [os.path.join("examples", item["video"]), item["prompt"]]
+                for item in examples
+                if item["prompt"]
+            ],
+            inputs=[video_input, t2v_prompt],
+        )
+
+        gr.Markdown("""
+### 📚 Citation
+@misc{he2025videoscore2thinkscoregenerative,
+      title={VideoScore2: Think before You Score in Generative Video Evaluation},
+      author={Xuan He and Dongfu Jiang and Ping Nie and Minghao Liu and Zhengxuan Jiang and Mingyi Su and Wentao Ma and Junru Lin and Chun Ye and Yi Lu and Keming Wu and Benjamin Schneider and Quy Duc Do and Zhuofeng Li and Yiming Jia and Yuxuan Zhang and Guo Cheng and Haozhe Wang and Wangchunshu Zhou and Qunshu Lin and Yuanxing Zhang and Ge Zhang and Wenhao Huang and Wenhu Chen},
+      year={2025},
+      eprint={2509.22799},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2509.22799},
+}""")
+    return demo
+
+# ----------------------------
+# Main
+# ----------------------------
+if __name__ == "__main__":
+    demo = build_demo()
+    demo.launch(share=True)
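
For context, `eval_vs2` returns a plain dict that the `gr.JSON` component renders directly. The three dimension values are the evaluator's soft scores (floats, see `eval_methods/vs2_float.py` below), not the raw 1-5 integers. An illustrative shape, with made-up values:

```python
# Illustrative only: the scores below are invented, not real model output.
example_result = {
    "visual quality": 2.87,
    "text-to-video alignment": 3.96,
    "physical/common-sense consistency": 2.45,
    "full analysis": "...model reasoning ending in the three 'dimension: score' lines...",
}
```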
eval_methods/vs2_float.py ADDED
@@ -0,0 +1,214 @@
+from transformers import AutoProcessor, AutoModelForVision2Seq, AutoTokenizer
+from qwen_vl_utils import process_vision_info
+import torch
+import numpy as np
+import cv2, os, re
+
+def _get_video_fps(url_or_path: str):
+    cap = cv2.VideoCapture(url_or_path)
+    if not cap.isOpened():
+        raise ValueError(f"Cannot open video: {url_or_path}")
+
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    cap.release()
+    return fps
+
+class eval_VideoScore2_float:
+    def __init__(self, model_name: str):
+        self.model, self.processor = self.load_model_processor(model_name)
+
+        # Fall back to a standalone tokenizer if the processor does not bundle one.
+        self.tokenizer = getattr(self.processor, "tokenizer", None)
+        if self.tokenizer is None:
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                use_fast=False,
+            )
+
+    def load_model_processor(self, model_name):
+        model = AutoModelForVision2Seq.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+        ).to("cuda")
+        processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+        return model, processor
+
+    def evaluate_video(self,
+                       user_prompt: str,
+                       video_path: str,
+                       kwargs: dict
+                       ) -> tuple[float | None, float | None, float | None, str]:
+        if not os.path.exists(video_path):
+            raise ValueError(f"Video does not exist: {video_path}")
+        max_tokens = kwargs.get("max_tokens", 4096)
+        infer_fps = kwargs.get("infer_fps", 2.0)
+        temperature = kwargs.get("temperature", 0.7)
+        # "raw" means: sample frames at the video's native frame rate.
+        if infer_fps == "raw":
+            infer_fps = _get_video_fps(video_path)
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "video",
+                        "video": video_path,
+                        "fps": infer_fps,
+                    },
+                    {
+                        "type": "text",
+                        "text": user_prompt,
+                    },
+                ],
+            }
+        ]
+
+        text = self.processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        try:
+            image_inputs, video_inputs = process_vision_info(messages)
+        except Exception as e:
+            raise ValueError(f"Error when reading: {video_path}") from e
+
+        inputs = self.processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            fps=infer_fps,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to("cuda")
+
+        gen_out = self.model.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            output_scores=True,
+            return_dict_in_generate=True,
+            do_sample=True,
+            temperature=temperature,
+        )
+        sequences = gen_out.sequences
+        scores = gen_out.scores  # one logits tensor per generated step
+
+        input_len = inputs["input_ids"].shape[1]
+
+        gen_token_ids = sequences[0, input_len:].tolist()
+
+        output_text = self.processor.batch_decode(
+            sequences[:, input_len:], skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+
+        # Parse the three integer scores from the model's structured output.
+        pattern = r"visual quality:\s*(\d+).*?text-to-video alignment:\s*(\d+).*?physical/common-sense consistency:\s*(\d+)"
+        match = re.search(pattern, output_text, re.DOTALL | re.IGNORECASE)
+        if match:
+            v_score_model = int(match.group(1))
+            t_score_model = int(match.group(2))
+            p_score_model = int(match.group(3))
+        else:
+            v_score_model = t_score_model = p_score_model = None
+
+        def find_score_token_index_by_prompt_v0(prompt_text: str) -> int:
+            # Earlier token-level matcher, kept as a fallback: locate the prompt's
+            # token sequence inside the generated ids, then scan forward for the
+            # first digit token.
+            prompt_tokens = self.tokenizer.encode(prompt_text, add_special_tokens=False)
+            gen_ids = gen_token_ids
+
+            for i in range(len(gen_ids) - len(prompt_tokens)):
+                if gen_ids[i:i+len(prompt_tokens)] == prompt_tokens:
+                    j = i + len(prompt_tokens)
+                    while j < len(gen_ids):
+                        token_str = self.tokenizer.decode([gen_ids[j]], skip_special_tokens=True).strip()
+                        if token_str.isdigit():
+                            return j
+                        j += 1
+            return -1
+
+        def find_score_token_index_by_prompt(prompt_text: str):
+            # String-level matcher: find the dimension label in the decoded text,
+            # locate the first digit after it, then map that character position
+            # back to a token index by incremental decoding.
+            gen_ids = gen_token_ids
+            gen_str = self.tokenizer.decode(gen_ids, skip_special_tokens=False)
+
+            pattern = r"(?:\(\d+\)\s*|\n\s*)?" + re.escape(prompt_text)
+            match = re.search(pattern, gen_str, flags=re.IGNORECASE)
+            if not match:
+                return -1
+            after_text = gen_str[match.end():]
+            num_match = re.search(r"\d", after_text)
+            if not num_match:
+                return -1
+
+            target_substr = gen_str[:match.end() + num_match.start() + 1]
+
+            for i in range(len(gen_ids)):
+                partial = self.tokenizer.decode(gen_ids[:i+1], skip_special_tokens=False)
+                if partial == target_substr:
+                    return i
+            return -1
+
+        idx_v = find_score_token_index_by_prompt("visual quality:")
+        idx_t = find_score_token_index_by_prompt("text-to-video alignment:")
+        idx_p = find_score_token_index_by_prompt("physical/common-sense consistency:")
+
+        def ll_based_soft_score_normed(hard_val, token_idx) -> float | None:
+            # Turn the hard integer score into a confidence-weighted float:
+            # read the probability of each digit 1-5 at the score token's position,
+            # pick the most likely digit, and scale it by that digit's share of
+            # the probability mass over the five candidates.
+            if hard_val is None or token_idx < 0:
+                return None
+            logits = scores[token_idx][0]  # [vocab]
+            score_range = list(range(1, 6))
+            score_probs = []  # [(score, prob)]
+
+            for s in score_range:
+                ids = self.tokenizer.encode(str(s), add_special_tokens=False)
+                if len(ids) == 1:
+                    tid = ids[0]
+                    logp = torch.log_softmax(logits, dim=-1)[tid].item()
+                    prob = float(np.exp(logp))
+                    score_probs.append((s, prob))
+                else:
+                    print(f"[warn] score {s} maps to multi-token: {ids}, skipping.")
+
+            if not score_probs:
+                print("[warn] No valid score token found (1–5 all multi-token?)")
+                return None
+
+            scores_list, probs_list = zip(*score_probs)
+            total_prob = sum(probs_list)
+            max_prob = max(probs_list)
+            max_idx = probs_list.index(max_prob)
+            best_score = scores_list[max_idx]
+
+            normalized_prob = max_prob / total_prob if total_prob > 0 else 0
+            soft_score = best_score * normalized_prob
+
+            print(f"hard score={hard_val}, token_idx={token_idx}")
+            for s, p in score_probs:
+                print(f"  score {s}: prob={p:.4f}")
+            print(f"  max prob={max_prob:.4f} at score={best_score}, total prob={total_prob:.4f}")
+            print(f"  normalized prob={normalized_prob:.4f}, soft score={soft_score:.4f}")
+
+            return round(soft_score, 4)
+
+        v_soft = ll_based_soft_score_normed(v_score_model, idx_v)
+        t_soft = ll_based_soft_score_normed(t_score_model, idx_t)
+        p_soft = ll_based_soft_score_normed(p_score_model, idx_p)
+
+        return v_soft, t_soft, p_soft, output_text
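
Note that `ll_based_soft_score_normed` does not take an expectation over the five candidate scores; it scales the most probable digit by that digit's share of the probability mass over 1-5. A self-contained numeric sketch of the rule, with toy probabilities:

```python
# Toy probabilities for digit tokens 1-5 at the score position (made up).
probs = {1: 0.02, 2: 0.10, 3: 0.55, 4: 0.25, 5: 0.08}

total = sum(probs.values())                               # 1.00
best_score, best_prob = max(probs.items(), key=lambda kv: kv[1])
soft = best_score * (best_prob / total)                   # 3 * 0.55 = 1.65
print(best_score, round(soft, 4))                         # low confidence pulls "3" toward 0

# An expectation-style variant, sum(s * p for s, p in probs.items()) / total,
# would give 3.27 here; the committed code scales the argmax instead.
```

One consequence: a confident "3" can outscore an unconfident "5", so the soft scores are confidence-weighted values rather than calibrated estimates on the 1-5 scale.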
examples/000149_r.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72fcbb5de5e9d7a617907432d1ed4f65cb3af12ce901db4e6863a2c303638a46
+size 2508141
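
These `.mp4` additions are Git LFS pointer files (hence the `*.mp4` rule added to `.gitattributes` above): three key-value lines stand in for the video bytes. A minimal sketch of the format, with a hypothetical `parse_lfs_pointer` helper:

```python
def parse_lfs_pointer(text: str) -> dict:
    # Each pointer line is "key value": version URL, oid as algo:hexdigest, size in bytes.
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    algo, digest = fields["oid"].split(":", 1)
    return {"version": fields["version"], "oid_algo": algo,
            "oid_hex": digest, "size_bytes": int(fields["size"])}

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:72fcbb5de5e9d7a617907432d1ed4f65cb3af12ce901db4e6863a2c303638a46
size 2508141"""
print(parse_lfs_pointer(pointer)["size_bytes"])  # 2508141
```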
examples/001990_d.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:795319e57bc01ba23437f87760abf9d145c49cb84e7f60fb7fde5e192a7bfdaa
+size 192358
examples/002242_j.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f56519938a278d93bdd1e8bf475d33f5bcbd8faba06d3a2f1ff2277a93634070
+size 115541
examples/002630_g.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3babfa3da068337dceea73ff7a3de8361a19c5350a580d9d099f7e931cc8d4ba
+size 1399187
examples/003020_f.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b553fcd2d1dd35c6f829e99342a4753f6da0572e59fb07284a52804faa02a6c
+size 403730
examples/003690_a.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b3c94355d0fae60be39211bdae3b176a130de318b2f887d2313f7cfe7071811
+size 92361
examples/examples.json ADDED
@@ -0,0 +1,26 @@
+[
+    {
+        "video": "000149_r.mp4",
+        "prompt": "A young blond man sitting on a bench, holding a fishing rod, with the fishing line out. Wide-angle, high resolution"
+    },
+    {
+        "video": "002242_j.mp4",
+        "prompt": "David walking into the hospital, with walls creaking with every step he takes. The lights flickering ominously and the murmurs of patients filtering in from the shadows. Pan right"
+    },
+    {
+        "video": "003020_f.mp4",
+        "prompt": "The vegan recipe adventure: An animated character embarks on a culinary journey to discover unique and delicious vegan recipes from around the world, showcasing the diversity of vegan cuisine"
+    },
+    {
+        "video": "003690_a.mp4",
+        "prompt": "An old woman is knitting on a rocking wooden chair and listening to the radio. A cat is next to her. Pan right"
+    },
+    {
+        "video": "002630_g.mp4",
+        "prompt": "A group of people gathers around a brick barbecue pit, where a whole pig is being roasted. The pig is surrounded by a metal grate and covered with charcoal, while people casually chat and enjoy the outdoor setting. Some stand, others sit on chairs, all focused on the pig being roasted. The scene is captured at eye level, offering a clear, static view of the event"
+    },
+    {
+        "video": "001990_d.mp4",
+        "prompt": "A picturesque sunrise over a quiet town, with a gold-haired boy in a red shirt standing on a hill overlooking the town"
+    }
+]
requirements.txt ADDED
@@ -0,0 +1,14 @@
+accelerate
+datasets==2.19.2
+gdown
+gradio
+opencv-python-headless
+pandas
+pyarrow
+qwen-vl-utils
+scipy
+spaces
+torch==2.6.0
+torchaudio==2.6.0
+torchvision==0.21.0
+transformers==4.53.2