"""Gradio demo for OneScorer (Q-Align): unified image/video visual scoring.

Loads a single mPLUG-Owl2 checkpoint once at startup and exposes three
scorers behind one Gradio interface:
  - Image Quality (IQA), Image Aesthetics (IAA), Video Quality (VQA).
"""

import argparse
import datetime
import json
import os
import time

import gradio as gr
import requests
from PIL import Image

from q_align.model.builder import load_pretrained_model
from q_align.conversation import (default_conversation, conv_templates,
                                  SeparatorStyle)
from q_align.constants import LOGDIR
from q_align.utils import (build_logger, server_error_msg,
                           violates_moderation, moderation_msg)
from q_align.evaluate.scorer import (QAlignScorer, QAlignAestheticScorer,
                                     QAlignVideoScorer)


def load_video(video_file):
    """Decode *video_file* at ~1 frame per second and return PIL Images.

    Parameters
    ----------
    video_file : str
        Path to a video file readable by decord.

    Returns
    -------
    list[PIL.Image.Image]
        One frame per second of video, in temporal order.
    """
    # Local import: decord is only required when a video is actually scored.
    from decord import VideoReader

    vr = VideoReader(video_file)
    fps = vr.get_avg_fps()
    # One sampled frame per second of footage.
    num_samples = int(len(vr) / fps)
    frame_indices = [int(fps * i) for i in range(num_samples)]
    frames = vr.get_batch(frame_indices).asnumpy()
    return [Image.fromarray(frames[i]) for i in range(num_samples)]


# --- One-time model setup -------------------------------------------------
pretrained = "q-future/one-align"
device = "cuda:0"
tokenizer, model, image_processor, _ = load_pretrained_model(
    pretrained, None, "mplug_owl2", device=device)

# All three scorers share the same underlying model/tokenizer/processor.
iqa_scorer = QAlignScorer(tokenizer=tokenizer, model=model,
                          image_processor=image_processor)
iaa_scorer = QAlignAestheticScorer(tokenizer=tokenizer, model=model,
                                   image_processor=image_processor)
vqa_scorer = QAlignVideoScorer(tokenizer=tokenizer, model=model,
                               image_processor=image_processor)

scorers = {"Image Aesthetics (IAA)": iaa_scorer,
           "Image Quality (IQA)": iqa_scorer,
           "Video Quality (VQA)": vqa_scorer}

# Rating levels (highest to lowest) and their numeric values; the scorer
# returns probabilities in this same order.
LEVELS = ["excellent (5)", "good (4)", "fair (3)", "poor (2)", "bad (1)"]
scores = [5, 4, 3, 2, 1]


def image_classifier(input_img, input_vid, scorer_type):
    """Score an uploaded image or video with the selected scorer.

    A video, when provided, takes precedence over the image.

    Parameters
    ----------
    input_img : PIL.Image.Image | None
    input_vid : str | None
        Path to the uploaded video, if any.
    scorer_type : str | None
        One of the `scorers` keys; defaults to IQA when unset.

    Returns
    -------
    tuple[dict, float]
        Per-level probabilities keyed by `LEVELS`, and the
        probability-weighted mean score in [1, 5].
    """
    if scorer_type is None:
        scorer_type = "Image Quality (IQA)"
    this_scorer = scorers[scorer_type]

    if input_vid is not None:
        input_ = load_video(input_vid)
    elif input_img is not None:
        input_ = [input_img]
    else:
        # BUG FIX: the original fell through with `input_` unbound (NameError)
        # when neither an image nor a video was uploaded.
        raise gr.Error("Please upload an image or a video first.")

    # The video scorer expects a batch of frame lists.
    if "Video" in scorer_type:
        input_ = [input_]

    probs = this_scorer(input_).mean(0).tolist()
    prob_dict = {level: prob for level, prob in zip(LEVELS, probs)}
    # Expected score: probability-weighted mean of the level values.
    score = sum(value * prob for value, prob in zip(scores, probs))
    return prob_dict, score


title_markdown = ("""

Q-Align: Teaching LMMs for Visual Scoring via Discrete Text-Defined Levels

One Unified Model for Visual scoring.

Haoning Wu1*+, Zicheng Zhang2*, Weixia Zhang2, Chaofeng Chen1, Liang Liao1, Chunyi Li2,
Yixuan Gao2, Annan Wang1, Erli Zhang1, Wenxiu Sun3, Qiong Yan3, Xiongkuo Min2, Guangtao Zhai2#, Weisi Lin1#
1Nanyang Technological University, 2Shanghai Jiao Tong University, 3Sensetime Research
*Equal contribution. +Project Lead. #Corresponding author(s).

If you like the OneScorer, please give us a star ✨ on GitHub for latest update.

""")

input_img = gr.Image(type='pil', label="Upload an Image")
# Typo fix: "INGORE" -> "IGNORE" in the user-facing label.
# NOTE(review): `info=` may not be accepted by gr.Video in every Gradio
# version (it is a FormComponent parameter) — confirm against the pinned
# gradio release.
input_vid = gr.Video(
    label="Upload a Video (will IGNORE the image if a video is uploaded)",
    info="If a video is uploaded, the image uploaded will be ignored.")
labels = gr.Label(label="Probabilities of rating levels:")
number = gr.Number(label="Output score:", info="Range in [1,5]. Higher is better.")

demo = gr.Interface(
    fn=image_classifier,
    inputs=[input_img, input_vid,
            gr.Radio(["Image Aesthetics (IAA)",
                      "Image Quality (IQA)",
                      "Video Quality (VQA)"],
                     label="Task",
                     info="Which Scorer will you need?")],
    outputs=[labels, number],
    title="OneScorer",
    description=title_markdown)
demo.launch(share=True)