DongfuJiang committed on
Commit
0a842a1
1 Parent(s): e1fb4c8
app_regression.py ADDED
@@ -0,0 +1,194 @@
+ import gradio as gr
+ import spaces
+ import os
+ import time
+ import json
+ import numpy as np
+ import av
+ import torch
+ from PIL import Image
+ import functools
+ from transformers import AutoProcessor, AutoConfig
+ from models.idefics2 import Idefics2ForSequenceClassification
+ from models.conversation import conv_templates
+ from typing import List
+
+
+ processor = AutoProcessor.from_pretrained("/home/dongfu/WorkSpace/Mantis/checkpoints/idefics2-8b/mantis-8b-idefics2-video-eval-debug_4096_regression/checkpoint-final")
+ model = Idefics2ForSequenceClassification.from_pretrained("/home/dongfu/WorkSpace/Mantis/checkpoints/idefics2-8b/mantis-8b-idefics2-video-eval-debug_4096_regression/checkpoint-final", torch_dtype=torch.bfloat16)
+ model.eval()
+ MAX_NUM_FRAMES = 24
+ conv_template = conv_templates["idefics_2"]
+
+ with open("./examples/all_subsets.json", 'r') as f:
+     examples = json.load(f)
+
+ for item in examples:
+     video_id = item['images'][0].split("_")[0]
+     item['images'] = [os.path.join("./examples", video_id, x) for x in item['images']]
+     item['video'] = os.path.join("./examples", item['video'])
+
+ with open("./examples/hd.json", 'r') as f:
+     hd_examples = json.load(f)
+
+ for item in hd_examples:
+     item['video'] = os.path.join("./examples", item['video'])
+
+ examples = hd_examples + examples
+
+ VIDEO_EVAL_PROMPT = """
+ Suppose you are an expert in judging and evaluating the quality of AI-generated videos,
+ please watch the following frames of a given video and see the text prompt for generating the video,
+ then give scores from 7 different dimensions:
+ (1) visual quality: the quality of the video in terms of clearness, resolution, brightness, and color
+ (2) object consistency, the consistency of objects or humans in video
+ (3) dynamic degree, the degree of dynamic changes
+ (4) motion smoothness, the smoothness of motion or movements
+ (5) text-to-video alignment, the alignment between the text prompt and the video content
+ (6) factual consistency, the consistency of the video content with the common-sense and factual knowledge
+ (7) overall score, the overall quality of the video
+ for each dimension, output a number from [1,2,3,4],
+ in which '1' is 'Bad', '2' is 'Average', '3' is 'Good', '4' is 'Perfect'
+ Here is an output example:
+ visual quality: 3
+ object consistency: 4
+ dynamic degree: 4
+ motion smoothness: 1
+ text-to-video alignment: 1
+ factual consistency: 2
+ overall score: 1
+
+ For this video, the text prompt is "{text_prompt}",
+ all the frames of video are as follows:
+
+ """
+ @spaces.GPU(duration=60)
+ def score(prompt:str, images:List[Image.Image]):
+     if not prompt:
+         raise gr.Error("Please provide a prompt")
+     model.to("cuda")
+     if not images:
+         images = None
+
+     flatten_images = []
+     if images is not None:
+         for x in images:
+             if isinstance(x, list):
+                 flatten_images.extend(x)
+             else:
+                 flatten_images.append(x)
+
+     flatten_images = [Image.open(x) if isinstance(x, str) else x for x in flatten_images]
+     # pass None (a text-only input) to the processor when no images were provided
+     inputs = processor(text=prompt, images=flatten_images if flatten_images else None, return_tensors="pt")
+     print(processor.tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=False))
+     inputs = {k: v.to(model.device) for k, v in inputs.items()}
+     outputs = model(**inputs)
+
+     # the regression head emits one logit per evaluation aspect
+     logits = outputs.logits
+     num_aspects = logits.shape[-1]
+     aspects = [f"aspect_{i}" for i in range(num_aspects)]
+
+     aspect_scores = {}
+     for i, aspect in enumerate(aspects):
+         aspect_scores[aspect] = logits[0, i].item()
+     return aspect_scores
+
+
+ def read_video_pyav(container, indices):
+     '''
+     Decode the video with PyAV decoder.
+
+     Args:
+         container (av.container.input.InputContainer): PyAV container.
+         indices (List[int]): List of frame indices to decode.
+
+     Returns:
+         np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
+     '''
+     frames = []
+     container.seek(0)
+     start_index = indices[0]
+     end_index = indices[-1]
+     for i, frame in enumerate(container.decode(video=0)):
+         if i > end_index:
+             break
+         if i >= start_index and i in indices:
+             frames.append(frame)
+     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+ def eval_video(prompt, video:str):
+     container = av.open(video)
+
+     # sample up to MAX_NUM_FRAMES frames uniformly from the video
+     total_frames = container.streams.video[0].frames
+     if total_frames > MAX_NUM_FRAMES:
+         indices = np.arange(0, total_frames, total_frames / MAX_NUM_FRAMES).astype(int)
+     else:
+         indices = np.arange(total_frames)
+     video_frames = read_video_pyav(container, indices)
+
+     frames = [Image.fromarray(x) for x in video_frames]
+
+     eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=prompt)
+
+     # append one <image> placeholder per frame so the processor can interleave the frames
+     num_image_token = eval_prompt.count("<image>")
+     if num_image_token < len(frames):
+         eval_prompt += "<image> " * (len(frames) - num_image_token)
+
+     aspect_scores = score(eval_prompt, [frames])
+     return aspect_scores
+
+ def build_demo():
+     with gr.Blocks() as demo:
+         gr.Markdown("""
+ ## Video Evaluation
+ Upload a video along with the text prompt that was used to generate it; the model will evaluate the video's quality along 7 different dimensions.
+ """)
+         with gr.Row():
+             video = gr.Video(width=500, label="Video")
+             with gr.Column():
+                 eval_prompt_template = gr.Textbox(VIDEO_EVAL_PROMPT.strip(' \n'), label="Evaluation Prompt Template", interactive=False, max_lines=26)
+                 video_prompt = gr.Textbox(label="Text Prompt", lines=1)
+                 with gr.Row():
+                     eval_button = gr.Button("Evaluate Video")
+                     clear_button = gr.ClearButton([video, video_prompt])
+                 # eval_result = gr.Textbox(label="Evaluation result", interactive=False, lines=7)
+                 eval_result = gr.Json(label="Evaluation result")
+
+         eval_button.click(
+             eval_video, [video_prompt, video], [eval_result]
+         )
+
+         dummy_id = gr.Textbox("id", label="id", visible=False, min_width=50)
+         dummy_output = gr.Textbox("reference score", label="reference scores", visible=False, lines=7)
+
+         gr.Examples(
+             examples=[
+                 [
+                     item['id'],
+                     item['prompt'],
+                     item['video'],
+                     item['conversations'][1]['value']
+                 ] for item in examples
+             ],
+             inputs=[dummy_id, video_prompt, video, dummy_output],
+         )
+
+         # gr.Markdown("""
+         # ## Citation
+         # ```
+         # @article{jiang2024mantis,
+         #   title={MANTIS: Interleaved Multi-Image Instruction Tuning},
+         #   author={Jiang, Dongfu and He, Xuan and Zeng, Huaye and Wei, Con and Ku, Max and Liu, Qian and Chen, Wenhu},
+         #   journal={arXiv preprint arXiv:2405.01483},
+         #   year={2024}
+         # }
+         # ```""")
+     return demo
+
+
+ if __name__ == "__main__":
+     demo = build_demo()
+     demo.launch(share=True)
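
A minimal usage sketch of the scoring pipeline above, outside of the Gradio UI. It assumes `app_regression.py` is importable and its checkpoint paths resolve locally; the aspect names are taken from `VIDEO_EVAL_PROMPT`, and the assumption that the regression head's `aspect_0 ... aspect_6` outputs follow that same order, as well as the example prompt and video path, are illustrative, not part of the commit.

```python
import json

from app_regression import eval_video  # assumes the module above is importable and its checkpoints resolve

# Dimension names as listed in VIDEO_EVAL_PROMPT; assumes the regression head's
# outputs (aspect_0 ... aspect_6) follow the same order.
ASPECT_NAMES = [
    "visual quality",
    "object consistency",
    "dynamic degree",
    "motion smoothness",
    "text-to-video alignment",
    "factual consistency",
    "overall score",
]

def eval_video_named(prompt: str, video_path: str) -> dict:
    """Run eval_video and re-key its generic aspect_i scores with readable names."""
    raw_scores = eval_video(prompt, video_path)  # e.g. {"aspect_0": 3.1, ..., "aspect_6": 2.7}
    return {
        ASPECT_NAMES[i] if i < len(ASPECT_NAMES) else key: value
        for i, (key, value) in enumerate(raw_scores.items())
    }

if __name__ == "__main__":
    # hypothetical prompt and local video path, for illustration only
    scores = eval_video_named(
        "a corgi running on a beach at sunset",
        "./examples/sample_video.mp4",
    )
    print(json.dumps(scores, indent=2))
```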
models/__init__.py ADDED
File without changes
models/idefics2/__init__.py ADDED
@@ -0,0 +1 @@
+ from .modeling_idefics2 import Idefics2ForConditionalGeneration, Idefics2ForSequenceClassification
models/idefics2/modeling_idefics2.py ADDED
The diff for this file is too large to render. See raw diff
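
Since the `modeling_idefics2.py` diff is not rendered, here is a minimal sketch of what `Idefics2ForSequenceClassification` plausibly provides, inferred only from how `app_regression.py` uses it (a regression-style head that emits one logit per evaluation aspect). The class internals, the last-token pooling, and the `num_aspects` default below are assumptions, not the actual implementation.

```python
import torch.nn as nn
from transformers import Idefics2Model, Idefics2PreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput

class Idefics2ForSequenceClassificationSketch(Idefics2PreTrainedModel):
    """Sketch: Idefics2 backbone plus a linear head that scores each aspect."""

    def __init__(self, config, num_aspects: int = 7):  # 7 matches the prompt's dimensions (assumption)
        super().__init__(config)
        self.model = Idefics2Model(config)
        # one continuous score per aspect (visual quality, ..., overall score)
        self.score_head = nn.Linear(config.text_config.hidden_size, num_aspects, bias=False)
        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, pixel_values=None,
                pixel_attention_mask=None, **kwargs):
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            pixel_attention_mask=pixel_attention_mask,
            **kwargs,
        )
        hidden_states = outputs.last_hidden_state   # (batch, seq_len, hidden)
        pooled = hidden_states[:, -1, :]             # last-token pooling (assumption)
        logits = self.score_head(pooled)             # (batch, num_aspects)
        return SequenceClassifierOutput(logits=logits)
```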