# Spaces: Sleeping
# app.py
import gradio as gr | |
from src.model_loader import load_model | |
from src.video_utils import process_video_for_internvl3 | |
from src.ar_prompts import generate_conversation_questions | |
# Load the tokenizer and model once at import time so every request handled by
# the Gradio callback reuses the same instances.
tokenizer, model = load_model()
def evaluate_ar_multi_turn(video):
    """Run the scripted multi-turn AR evaluation dialogue on an uploaded video.

    The video is preprocessed with ``process_video_for_internvl3`` and the
    questions returned by ``generate_conversation_questions`` are sent to the
    model one at a time, passing the chat history returned by each turn into
    the next one. Only the answers from the third turn onward are returned.

    Args:
        video: Value supplied by the Gradio video input. Presumably a file
            path, or ``None`` when nothing was uploaded (confirm against the
            Gradio version in use).

    Returns:
        The kept answers joined by blank lines, or a short message asking for
        a video when no input was provided.
    """
    if not video:
        # Submitting the form without an upload yields an empty value; stop
        # here instead of handing it to the video preprocessing.
        return "Please upload a video before running the evaluation."

    pixel_values, num_patches_list, image_prefix = process_video_for_internvl3(video)
    conversation = generate_conversation_questions(include_descriptions=True)

    history = None
    visible_outputs = []
    for i, question in enumerate(conversation):
        # image_prefix is prepended to the first question only.
        prompt = image_prefix + question if i == 0 else question
        output, history = model.chat(
            tokenizer,
            pixel_values,
            prompt,
            generation_config={"max_new_tokens": 1024},
            num_patches_list=num_patches_list,
            history=history,
            return_history=True,
        )
        # Keep only the answers of the evaluation and extension parts
        # (i.e. from the third turn onward).
        if i >= 2:
            visible_outputs.append(output)

    # Join the kept answers into one text for display.
    return "\n\n".join(visible_outputs)
# Build the single-page UI: one video input, one text output, backed by
# evaluate_ar_multi_turn. The interface is bound to a name so it can be
# referenced after construction; it is still launched at module level.
demo = gr.Interface(
    fn=evaluate_ar_multi_turn,
    inputs=gr.Video(label="Upload your AR video"),
    outputs="text",
    title="InternVL3 AR Evaluation (Multi-turn)",
    description=(
        "Upload a short AR video clip. The model will sample frames and "
        "conduct a multi-turn dialogue to assess "
        "occlusion/rendering/placement/lighting."
    ),
)
demo.launch()