hexuan21 committed
Commit 120ff1b · Parent: 6811eb1

add gradio app

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,126 @@
+import gradio as gr
+import spaces
+import torch
+import json
+import os
+from string import Template
+from eval_methods.vs2_float import eval_VideoScore2_float
+
+# ----------------------------
+# Constants and Model Init
+# ----------------------------
+MODEL_NAME = "TIGER-Lab/VideoScore2"
+
+vs2_evaluator = eval_VideoScore2_float(MODEL_NAME)
+
+VS2_QUERY_TEMPLATE = Template("""
+You are an expert at evaluating and reasoning about the quality of AI-generated videos across diverse dimensions.
+
+We would like to evaluate the video's quality from three dimensions: 'visual quality', 'text-to-video alignment' and 'physical/common-sense consistency'. Below is the definition of each dimension:
+(1) visual quality:
+The dimension 'visual quality' concerns the video's visual and optical properties, including resolution, overall clarity, local blurriness, smoothness, stability of brightness/contrast, distortion/misalignment, abrupt changes, and any other factors that affect the watching experience.
+(2) text-to-video alignment:
+The dimension 't2v_alignment' mainly assesses whether the generated video fully and accurately depicts the elements mentioned in the text prompt, such as characters, actions, animals, etc., as well as background, quantity, color, weather, and so on.
+(3) physical/common-sense consistency:
+The dimension 'physical/common-sense consistency' mainly examines whether the video contains violations of common sense or physical laws, or anything else that appears strange or unnatural.
+
+Here we provide an AI video generated by text-to-video models and its text prompt:
+$t2v_prompt.
+
+Based on the video content and the dimension definitions, please evaluate the video and give the quality scores.
+Each quality score must be an integer in the range 1 - 5.
+
+Your output must be in the following format:
+visual quality: <v_score>;
+text-to-video alignment: <t_score>;
+physical/common-sense consistency: <p_score>
+
+DO NOT include anything else before or after your output.
+""")
+
+space_description = """
+[📃Paper](https://www.arxiv.org/abs/2509.22799) | [🌐Website](https://tiger-ai-lab.github.io/VideoScore2/) | [💻GitHub](https://github.com/TIGER-AI-Lab/VideoScore2) | [🛢️Dataset](https://huggingface.co/datasets/TIGER-Lab/VideoFeedback2) | [🤗Model](https://huggingface.co/TIGER-Lab/VideoScore2)
+
+**VideoScore2** is a next-generation, interpretable and multi-dimensional video evaluation model designed to align with human judgment on text-to-video generation tasks.
+It explicitly evaluates **visual quality**, **text-to-video alignment**, and **physical/common-sense consistency**, producing structured scores and reasoning.
+"""
+
+with open("./examples/examples.json", "r") as f:
+    examples = json.load(f)
+
+# ----------------------------
+# Evaluation Core
+# ----------------------------
+@spaces.GPU(duration=60)  # request a ZeroGPU slot for up to 60s per call
+def eval_vs2(video_path, t2v_prompt):
+    if not video_path:
+        raise gr.Error("Please upload a video.")
+    if not t2v_prompt:
+        raise gr.Error("Please provide a text prompt.")
+
+    user_prompt = VS2_QUERY_TEMPLATE.substitute(t2v_prompt=t2v_prompt)
+    method_kwargs = {
+        "max_tokens": 1024,
+        "infer_fps": 2.0,
+    }
+
+    with torch.no_grad():
+        v_score, t_score, p_score, full_text = vs2_evaluator.evaluate_video(
+            user_prompt=user_prompt,
+            video_path=video_path,
+            kwargs=method_kwargs,
+        )
+
+    return {
+        "visual quality": v_score,
+        "text-to-video alignment": t_score,
+        "physical/common-sense consistency": p_score,
+        "full analysis": full_text,
+    }
+
+# ----------------------------
+# Build Gradio Demo
+# ----------------------------
+def build_demo():
+    with gr.Blocks() as demo:
+        gr.Markdown("## VideoScore2: Think before You Score in Generative Video Evaluation")
+        gr.Markdown(space_description)
+        gr.Image("https://tiger-ai-lab.github.io/VideoScore2/static/images/teaser.png", label="Teaser")
+
+        with gr.Row():
+            video_input = gr.Video(label="Upload your video", width=500)
+            with gr.Column():
+                t2v_prompt = gr.Textbox(label="Text-to-Video Prompt", placeholder="Describe the video prompt...")
+                eval_btn = gr.Button("Evaluate Video", variant="primary")
+                result_box = gr.JSON(label="Evaluation Result")
+
+        eval_btn.click(fn=eval_vs2, inputs=[video_input, t2v_prompt], outputs=[result_box])
+
+        gr.Examples(
+            examples=[
+                # examples.json stores bare filenames; the videos live under ./examples.
+                [os.path.join("examples", item["video"]), item["prompt"]]
+                for item in examples
+                if item["prompt"]
+            ],
+            inputs=[video_input, t2v_prompt],
+        )
+
+        gr.Markdown("""
+### 📚 Citation
+@misc{he2025videoscore2thinkscoregenerative,
+      title={VideoScore2: Think before You Score in Generative Video Evaluation},
+      author={Xuan He and Dongfu Jiang and Ping Nie and Minghao Liu and Zhengxuan Jiang and Mingyi Su and Wentao Ma and Junru Lin and Chun Ye and Yi Lu and Keming Wu and Benjamin Schneider and Quy Duc Do and Zhuofeng Li and Yiming Jia and Yuxuan Zhang and Guo Cheng and Haozhe Wang and Wangchunshu Zhou and Qunshu Lin and Yuanxing Zhang and Ge Zhang and Wenhao Huang and Wenhu Chen},
+      year={2025},
+      eprint={2509.22799},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2509.22799},
+}""")
+    return demo
+
+# ----------------------------
+# Main
+# ----------------------------
+if __name__ == "__main__":
+    demo = build_demo()
+    demo.launch(share=True)
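
For context, `eval_vs2` returns a plain dict that the `gr.JSON` component renders directly. The three dimension values are the evaluator's soft scores (floats, see `eval_methods/vs2_float.py` below), not the raw 1-5 integers. An illustrative shape, with made-up values:

```python
# Illustrative only: the scores below are invented, not real model output.
example_result = {
    "visual quality": 2.87,
    "text-to-video alignment": 3.96,
    "physical/common-sense consistency": 2.45,
    "full analysis": "...model reasoning ending in the three 'dimension: score' lines...",
}
```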
eval_methods/vs2_float.py ADDED
@@ -0,0 +1,214 @@
+from transformers import AutoProcessor, AutoModelForVision2Seq, AutoTokenizer
+from qwen_vl_utils import process_vision_info
+import torch
+import numpy as np
+import cv2, os, re
+
+def _get_video_fps(url_or_path: str):
+    cap = cv2.VideoCapture(url_or_path)
+    if not cap.isOpened():
+        raise ValueError(f"Cannot open video: {url_or_path}")
+
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    cap.release()
+    return fps
+
+class eval_VideoScore2_float:
+    def __init__(self, model_name: str):
+        self.model, self.processor = self.load_model_processor(model_name)
+
+        # Fall back to a standalone tokenizer if the processor does not bundle one.
+        self.tokenizer = getattr(self.processor, "tokenizer", None)
+        if self.tokenizer is None:
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_name,
+                trust_remote_code=True,
+                use_fast=False,
+            )
+
+    def load_model_processor(self, model_name):
+        model = AutoModelForVision2Seq.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+        ).to("cuda")
+        processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+        return model, processor
+
+    def evaluate_video(self,
+                       user_prompt: str,
+                       video_path: str,
+                       kwargs: dict
+                       ) -> tuple[float | None, float | None, float | None, str]:
+        if not os.path.exists(video_path):
+            raise ValueError(f"Video does not exist: {video_path}")
+        max_tokens = kwargs.get("max_tokens", 4096)
+        infer_fps = kwargs.get("infer_fps", 2.0)
+        temperature = kwargs.get("temperature", 0.7)
+        # "raw" means: sample frames at the video's native frame rate.
+        if infer_fps == "raw":
+            infer_fps = _get_video_fps(video_path)
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "video",
+                        "video": video_path,
+                        "fps": infer_fps,
+                    },
+                    {
+                        "type": "text",
+                        "text": user_prompt,
+                    },
+                ],
+            }
+        ]
+
+        text = self.processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        try:
+            image_inputs, video_inputs = process_vision_info(messages)
+        except Exception as e:
+            raise ValueError(f"Error when reading: {video_path}") from e
+
+        inputs = self.processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            fps=infer_fps,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to("cuda")
+
+        gen_out = self.model.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            output_scores=True,
+            return_dict_in_generate=True,
+            do_sample=True,
+            temperature=temperature,
+        )
+        sequences = gen_out.sequences
+        scores = gen_out.scores  # one logits tensor per generated step
+
+        input_len = inputs["input_ids"].shape[1]
+
+        gen_token_ids = sequences[0, input_len:].tolist()
+
+        output_text = self.processor.batch_decode(
+            sequences[:, input_len:], skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+
+        # Parse the three integer scores from the model's structured output.
+        pattern = r"visual quality:\s*(\d+).*?text-to-video alignment:\s*(\d+).*?physical/common-sense consistency:\s*(\d+)"
+        match = re.search(pattern, output_text, re.DOTALL | re.IGNORECASE)
+        if match:
+            v_score_model = int(match.group(1))
+            t_score_model = int(match.group(2))
+            p_score_model = int(match.group(3))
+        else:
+            v_score_model = t_score_model = p_score_model = None
+
+        def find_score_token_index_by_prompt_v0(prompt_text: str) -> int:
+            # Earlier token-level matcher, kept as a fallback: locate the prompt's
+            # token sequence inside the generated ids, then scan forward for the
+            # first digit token.
+            prompt_tokens = self.tokenizer.encode(prompt_text, add_special_tokens=False)
+            gen_ids = gen_token_ids
+
+            for i in range(len(gen_ids) - len(prompt_tokens)):
+                if gen_ids[i:i+len(prompt_tokens)] == prompt_tokens:
+                    j = i + len(prompt_tokens)
+                    while j < len(gen_ids):
+                        token_str = self.tokenizer.decode([gen_ids[j]], skip_special_tokens=True).strip()
+                        if token_str.isdigit():
+                            return j
+                        j += 1
+            return -1
+
+        def find_score_token_index_by_prompt(prompt_text: str):
+            # String-level matcher: find the dimension label in the decoded text,
+            # locate the first digit after it, then map that character position
+            # back to a token index by incremental decoding.
+            gen_ids = gen_token_ids
+            gen_str = self.tokenizer.decode(gen_ids, skip_special_tokens=False)
+
+            pattern = r"(?:\(\d+\)\s*|\n\s*)?" + re.escape(prompt_text)
+            match = re.search(pattern, gen_str, flags=re.IGNORECASE)
+            if not match:
+                return -1
+            after_text = gen_str[match.end():]
+            num_match = re.search(r"\d", after_text)
+            if not num_match:
+                return -1
+
+            target_substr = gen_str[:match.end() + num_match.start() + 1]
+
+            for i in range(len(gen_ids)):
+                partial = self.tokenizer.decode(gen_ids[:i+1], skip_special_tokens=False)
+                if partial == target_substr:
+                    return i
+            return -1
+
+        idx_v = find_score_token_index_by_prompt("visual quality:")
+        idx_t = find_score_token_index_by_prompt("text-to-video alignment:")
+        idx_p = find_score_token_index_by_prompt("physical/common-sense consistency:")
+
+        def ll_based_soft_score_normed(hard_val, token_idx) -> float | None:
+            # Turn the hard integer score into a confidence-weighted float:
+            # read the probability of each digit 1-5 at the score token's position,
+            # pick the most likely digit, and scale it by that digit's share of
+            # the probability mass over the five candidates.
+            if hard_val is None or token_idx < 0:
+                return None
+            logits = scores[token_idx][0]  # [vocab]
+            score_range = list(range(1, 6))
+            score_probs = []  # [(score, prob)]
+
+            for s in score_range:
+                ids = self.tokenizer.encode(str(s), add_special_tokens=False)
+                if len(ids) == 1:
+                    tid = ids[0]
+                    logp = torch.log_softmax(logits, dim=-1)[tid].item()
+                    prob = float(np.exp(logp))
+                    score_probs.append((s, prob))
+                else:
+                    print(f"[warn] score {s} maps to multi-token: {ids}, skipping.")
+
+            if not score_probs:
+                print("[warn] No valid score token found (1–5 all multi-token?)")
+                return None
+
+            scores_list, probs_list = zip(*score_probs)
+            total_prob = sum(probs_list)
+            max_prob = max(probs_list)
+            max_idx = probs_list.index(max_prob)
+            best_score = scores_list[max_idx]
+
+            normalized_prob = max_prob / total_prob if total_prob > 0 else 0
+            soft_score = best_score * normalized_prob
+
+            print(f"hard score={hard_val}, token_idx={token_idx}")
+            for s, p in score_probs:
+                print(f"  score {s}: prob={p:.4f}")
+            print(f"  max prob={max_prob:.4f} at score={best_score}, total prob={total_prob:.4f}")
+            print(f"  normalized prob={normalized_prob:.4f}, soft score={soft_score:.4f}")
+
+            return round(soft_score, 4)
+
+        v_soft = ll_based_soft_score_normed(v_score_model, idx_v)
+        t_soft = ll_based_soft_score_normed(t_score_model, idx_t)
+        p_soft = ll_based_soft_score_normed(p_score_model, idx_p)
+
+        return v_soft, t_soft, p_soft, output_text
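
Note that `ll_based_soft_score_normed` does not take an expectation over the five candidate scores; it scales the most probable digit by that digit's share of the probability mass over 1-5. A self-contained numeric sketch of the rule, with toy probabilities:

```python
# Toy probabilities for digit tokens 1-5 at the score position (made up).
probs = {1: 0.02, 2: 0.10, 3: 0.55, 4: 0.25, 5: 0.08}

total = sum(probs.values())                               # 1.00
best_score, best_prob = max(probs.items(), key=lambda kv: kv[1])
soft = best_score * (best_prob / total)                   # 3 * 0.55 = 1.65
print(best_score, round(soft, 4))                         # low confidence pulls "3" toward 0

# An expectation-style variant, sum(s * p for s, p in probs.items()) / total,
# would give 3.27 here; the committed code scales the argmax instead.
```

One consequence: a confident "3" can outscore an unconfident "5", so the soft scores are confidence-weighted values rather than calibrated estimates on the 1-5 scale.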
examples/000149_r.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72fcbb5de5e9d7a617907432d1ed4f65cb3af12ce901db4e6863a2c303638a46
+size 2508141
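
These `.mp4` additions are Git LFS pointer files (hence the `*.mp4` rule added to `.gitattributes` above): three key-value lines stand in for the video bytes. A minimal sketch of the format, with a hypothetical `parse_lfs_pointer` helper:

```python
def parse_lfs_pointer(text: str) -> dict:
    # Each pointer line is "key value": version URL, oid as algo:hexdigest, size in bytes.
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    algo, digest = fields["oid"].split(":", 1)
    return {"version": fields["version"], "oid_algo": algo,
            "oid_hex": digest, "size_bytes": int(fields["size"])}

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:72fcbb5de5e9d7a617907432d1ed4f65cb3af12ce901db4e6863a2c303638a46
size 2508141"""
print(parse_lfs_pointer(pointer)["size_bytes"])  # 2508141
```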
examples/001990_d.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:795319e57bc01ba23437f87760abf9d145c49cb84e7f60fb7fde5e192a7bfdaa
+size 192358
examples/002242_j.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f56519938a278d93bdd1e8bf475d33f5bcbd8faba06d3a2f1ff2277a93634070
+size 115541
examples/002630_g.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3babfa3da068337dceea73ff7a3de8361a19c5350a580d9d099f7e931cc8d4ba
+size 1399187
examples/003020_f.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b553fcd2d1dd35c6f829e99342a4753f6da0572e59fb07284a52804faa02a6c
+size 403730
examples/003690_a.mp4 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b3c94355d0fae60be39211bdae3b176a130de318b2f887d2313f7cfe7071811
+size 92361
examples/examples.json ADDED
@@ -0,0 +1,26 @@
+[
+    {
+        "video": "000149_r.mp4",
+        "prompt": "A young blond man sitting on a bench, holding a fishing rod, with the fishing line out. Wide-angle, high resolution"
+    },
+    {
+        "video": "002242_j.mp4",
+        "prompt": "David walking into the hospital, with walls creaking with every step he takes. The lights flickering ominously and the murmurs of patients filtering in from the shadows. Pan right"
+    },
+    {
+        "video": "003020_f.mp4",
+        "prompt": "The vegan recipe adventure: An animated character embarks on a culinary journey to discover unique and delicious vegan recipes from around the world, showcasing the diversity of vegan cuisine"
+    },
+    {
+        "video": "003690_a.mp4",
+        "prompt": "An old woman is knitting on a rocking wooden chair and listening to the radio. A cat is next to her. Pan right"
+    },
+    {
+        "video": "002630_g.mp4",
+        "prompt": "A group of people gathers around a brick barbecue pit, where a whole pig is being roasted. The pig is surrounded by a metal grate and covered with charcoal, while people casually chat and enjoy the outdoor setting. Some stand, others sit on chairs, all focused on the pig being roasted. The scene is captured at eye level, offering a clear, static view of the event"
+    },
+    {
+        "video": "001990_d.mp4",
+        "prompt": "A picturesque sunrise over a quiet town, with a gold-haired boy in a red shirt standing on a hill overlooking the town"
+    }
+]
requirements.txt ADDED
@@ -0,0 +1,14 @@
+accelerate
+datasets==2.19.2
+gdown
+gradio
+opencv-python-headless
+pandas
+pyarrow
+qwen-vl-utils
+scipy
+spaces
+torch==2.6.0
+torchaudio==2.6.0
+torchvision==0.21.0
+transformers==4.53.2