import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
from decord import VideoReader, cpu
import spaces
import time

# Load MiniCPM-V-2.6 in bfloat16 and move it to the GPU.
model_path = 'openbmb/MiniCPM-V-2_6'
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)
model = model.to(device='cuda')
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model.eval()

MAX_NUM_FRAMES = 64


def encode_image(image):
    """Open the image if needed and downscale so the longest side fits 448*16 px."""
    if not isinstance(image, Image.Image):
        image = Image.open(image).convert("RGB")
    max_size = 448 * 16
    if max(image.size) > max_size:
        w, h = image.size
        if w > h:
            new_w = max_size
            new_h = int(h * max_size / w)
        else:
            new_h = max_size
            new_w = int(w * max_size / h)
        image = image.resize((new_w, new_h), resample=Image.BICUBIC)
    return image


def encode_video(video_path):
    """Sample roughly one frame per second and return a list of resized PIL images."""
    vr = VideoReader(video_path, ctx=cpu(0))
    # Stride of ~avg_fps frames gives ~1 fps sampling; clamp to 1 so the stride
    # is never zero for very low-fps clips.
    stride = max(round(vr.get_avg_fps()), 1)
    frame_idx = list(range(0, len(vr), stride))
    if len(frame_idx) > MAX_NUM_FRAMES:
        # Keep only the first MAX_NUM_FRAMES sampled frames (~the first 64 seconds).
        frame_idx = frame_idx[:MAX_NUM_FRAMES]
    frames = vr.get_batch(frame_idx).asnumpy()
    return [encode_image(Image.fromarray(f.astype('uint8'))) for f in frames]


@spaces.GPU
def analyze_video(prompt, video):
    if video is None:
        return "Please upload a video first."
    start_time = time.time()
    # gr.Video may return either a file path string or a file-like object with .name.
    video_path = video if isinstance(video, str) else video.name
    encoded_video = encode_video(video_path)
    context = [{"role": "user", "content": [prompt] + encoded_video}]
    params = {
        'sampling': True,
        'top_p': 0.8,
        'top_k': 100,
        'temperature': 0.7,
        'repetition_penalty': 1.05,
        'max_new_tokens': 2048,
        'max_inp_length': 4352,
        'use_image_id': False,
        # Fewer image slices per frame for longer videos, to stay within the input budget.
        'max_slice_nums': 1 if len(encoded_video) > 16 else 2,
    }
    response = model.chat(image=None, msgs=context, tokenizer=tokenizer, **params)
    processing_time = time.time() - start_time
    return f"Analysis Result:\n{response}\n\nProcessing Time: {processing_time:.2f} seconds"


with gr.Blocks() as demo:
    gr.Markdown("# Video Analyzer")
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(label="Prompt")
            video_input = gr.Video(label="Upload Video")
        with gr.Column():
            output = gr.Textbox(label="Analysis Result and Processing Time")
    analyze_button = gr.Button("Analyze Video")
    analyze_button.click(fn=analyze_video, inputs=[prompt_input, video_input], outputs=output)

demo.launch()
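
# Quick smoke test, bypassing the Gradio UI (a sketch, not part of the app:
# assumes a CUDA device is available and a local clip exists at the
# hypothetical path "sample.mp4"; @spaces.GPU is a no-op outside HF Spaces):
#
#     result = analyze_video("Describe what happens in this video.", "sample.mp4")
#     print(result)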