diff --git a/.gitignore b/.gitignore index 082c0a313cbf554b2d5be1e68f969abf3d53ac11..e0f3945d1c6a604a2e4ab4611564f460392263ed 100644 --- a/.gitignore +++ b/.gitignore @@ -156,4 +156,6 @@ cython_debug/ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. \ No newline at end of file +# option (not recommended) you can uncomment the following to ignore the entire idea folder. + +/.vscode \ No newline at end of file diff --git a/app.py b/app.py index be8a7ad57b5dc5100affc2fda86fc1f482aa34b8..e4c6b74477ad6b67a80285e3a17e257fd8bd458a 100644 --- a/app.py +++ b/app.py @@ -3,13 +3,16 @@ import spaces import os import time import json +import numpy as np +import av from PIL import Image import functools from transformers import AutoProcessor, Idefics2ForConditionalGeneration from models.conversation import conv_templates from typing import List -processor = AutoProcessor.from_pretrained("MFuyu/mantis-8b-idefics2-video-eval_8192_lora") -model = Idefics2ForConditionalGeneration.from_pretrained("MFuyu/mantis-8b-idefics2-video-eval_8192_lora") +processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-95k-mantis-2epoch_4096") +model = Idefics2ForConditionalGeneration.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-95k-mantis-2epoch_4096") +MAX_NUM_FRAMES = 24 conv_template = conv_templates["idefics_2"] with open("./examples/all_subsets.json", 'r') as f: @@ -18,8 +21,33 @@ with open("./examples/all_subsets.json", 'r') as f: for item in examples: video_id = item['images'][0].split("_")[0] item['images'] = [os.path.join("./examples", video_id, x) for x in item['images']] - -prompt = "Suppose you are an expert in judging and evaluating the quality of AI-generated videos, \nplease watch the following frames of a given video and see the text prompt for generating the video, \nthen give scores from 7 different dimensions:\n(1) visual quality, \n(2) object consistency,\n(3) dynamic degree,\n(4) motion smoothness,\n(5) text-to-video alignment,\n(6) factual consistency, \n(7) overall score\nfor each dimension, output a number from [1,2,3], in which '1' stands for 'Bad', '2' stands for 'Average', '3' stands for 'Good'.\nHere is an output example: \nvisual quality: 3\nobject consistency: 2 \ndynamic degree: 2\nmotion smoothness: 1\ntext-to-video alignment: 1\nfactual consistency: 2\noverall score: 1\n\nFor this item, the text prompt is the beautiful girl, long hair,walk on the sity street, red cloth ,\nall the frames of video are as follows: \n\n" + item['video'] = os.path.join("./examples", item['video']) + +VIDEO_EVAL_PROMPT = """ +Suppose you are an expert in judging and evaluating the quality of AI-generated videos, +please watch the following frames of a given video and see the text prompt for generating the video, +then give scores from 7 different dimensions: +(1) visual quality: the quality of the video in terms of clearness, resolution, brightness, and color +(2) object consistency, the consistency of objects or humans in video +(3) dynamic degree, the degree of dynamic changes +(4) motion smoothness, the smoothness of motion or movements +(5) text-to-video alignment, the alignment between the text prompt and the video content +(6) factual consistency, the consistency of the video content with the common-sense and factual knowledge +(7) overall score, the overall quality of the video +for each dimension, output a number from [1,2,3], in which '1' is 'Bad', '2' is 'Average', '3' is 'Good'. +Here is an output example: +visual quality: 3 +object consistency: 2 +dynamic degree: 2 +motion smoothness: 1 +text-to-video alignment: 1 +factual consistency: 2 +overall score: 1 + +For this video, the text prompt is "{text_prompt}", +all the frames of video are as follows: + +""" @spaces.GPU def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs): global processor, model @@ -32,6 +60,14 @@ def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs): idefics_2_message = [] cur_img_idx = 0 + cur_vid_idx = 0 + all_videos = [x for x in images if isinstance(x, list)] + flatten_images = [] + for x in images: + if isinstance(x, list): + flatten_images.extend(x) + else: + flatten_images.append(x) print(history) for i, message in enumerate(history): @@ -41,6 +77,11 @@ def generate(text:str, images:List[Image.Image], history: List[dict], **kwargs): "content": [] }) message_text = message["text"] + num_video_tokens_in_text = message_text.count("