|
|
| import gradio as gr |
| from transformers import BlipProcessor, BlipForConditionalGeneration |
| from PIL import Image |
| import cv2 |
| import os |
|
|
| |
# Hugging Face checkpoint shared by the processor and the captioning model.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# Both objects are created once, when the module is first executed, and are
# then reused as module-level globals by every call to process_video.
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)
|
|
# Length of each analysed segment, in seconds.
_SEGMENT_SECONDS = 4
# Number of consecutive segments merged into one prompt (2 x 4 s = 8 s).
_SEGMENTS_PER_PROMPT = 2


def process_video(video_path):
    """Caption the centre frame of each 4 s segment and build 8 s prompts.

    The video is split into consecutive segments of ``_SEGMENT_SECONDS``
    seconds (the last one may be shorter). For every segment the frame at
    its temporal centre is read, saved as ``frames/frame_<i>.jpg`` and
    captioned with the module-level ``processor`` / ``model`` pair.

    Args:
        video_path: Path of the video file to analyse.

    Returns:
        A text report: one ``Segmento N (start-end s): caption`` line per
        captioned segment, a blank line, then one ``Prompt N: ...`` line per
        8 s window joining the descriptions of the segments inside it.

    Raises:
        ValueError: If no path is given, the video cannot be opened, or it
            reports no usable frame rate / frame count.
    """
    if not video_path:
        raise ValueError("No video was provided.")

    vidcap = cv2.VideoCapture(video_path)
    try:
        if not vidcap.isOpened():
            raise ValueError(f"Could not open video: {video_path}")

        fps = vidcap.get(cv2.CAP_PROP_FPS)
        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        # `not fps > 0` (rather than `fps <= 0`) also rejects NaN, and the
        # check prevents the division by zero below.
        if not fps > 0 or total_frames <= 0:
            raise ValueError(
                f"Video reports no usable frames (fps={fps}, "
                f"frames={total_frames}): {video_path}"
            )
        duration = total_frames / fps

        os.makedirs("frames", exist_ok=True)

        descriptions = []
        # Window index -> descriptions of the segments inside that window.
        # Grouping by segment index (not by position in `descriptions`)
        # keeps prompts aligned to their time window even when a frame
        # could not be read and its segment is skipped.
        windows = {}

        index = 0
        start = 0
        while start < duration:
            end = min(start + _SEGMENT_SECONDS, duration)
            center_time = (start + end) / 2
            # Clamp defensively so the seek never points past the last frame.
            center_frame = min(int(center_time * fps), total_frames - 1)
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, center_frame)
            success, frame = vidcap.read()
            if success:
                # The frame is still written to disk, but the model is fed
                # from memory: no lossy JPEG round-trip and no risk of
                # reading a file that another request has just overwritten.
                cv2.imwrite(f"frames/frame_{index}.jpg", frame)
                # OpenCV delivers BGR; PIL and the processor expect RGB.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                pil_image = Image.fromarray(rgb_frame)

                inputs = processor(images=pil_image, return_tensors="pt")
                out = model.generate(**inputs)
                caption = processor.decode(out[0], skip_special_tokens=True)

                description = (
                    f"Segmento {index + 1} ({start:.1f}-{end:.1f}s): {caption}"
                )
                descriptions.append(description)
                window = index // _SEGMENTS_PER_PROMPT
                windows.setdefault(window, []).append(description)

            start += _SEGMENT_SECONDS
            index += 1
    finally:
        # Always free the capture handle, including on errors above.
        vidcap.release()

    prompts = []
    for window, parts in windows.items():
        combined = " ".join(parts)
        prompts.append(f"Prompt {window + 1}: {combined}")

    return "\n".join(descriptions) + "\n\n" + "\n".join(prompts)
|
|
# Gradio UI: a single video input whose value is handed to process_video,
# with the returned string rendered as plain text.
iface = gr.Interface(
    title="Video Analyzer with BLIP (CPU Friendly)",
    description="Faz análise de frames centrais de segmentos de 4s e gera prompts combinados de 8s.",
    fn=process_video,
    inputs=gr.Video(),
    outputs="text",
)

# Launch the interface as soon as this module is executed.
iface.launch()
|
|