import cv2
import gradio as gr
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

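# Required packages (inferred from the imports above; pin exact versions in
# requirements.txt as needed): gradio, transformers, torch, opencv-python, pillow
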
# Load Models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model 1: ViT-GPT2
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Model 2: FuseCap
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device)

# Model 3: BLIP Large
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)

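# Optional (not in the original Space): switch the models to eval mode so
# dropout is disabled during caption generation.
# for m in (model1, model2, model3):
#     m.eval()
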
# Frame Extraction and Captioning Logic
def process_video(video_path):
    vidObj = cv2.VideoCapture(video_path)
    count = 0
    frame_captions = {"Model 1": [], "Model 2": [], "Model 3": []}

    while True:
        success, frame = vidObj.read()
        if not success:
            break

        # Process every 20th frame
        if count % 20 == 0:
            # OpenCV returns BGR frames; convert to RGB for the PIL/transformers pipeline
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Model 1: ViT-GPT2
            pixel_values = feature_extractor1(images=[image], return_tensors="pt").pixel_values.to(device)
            output_ids = model1.generate(pixel_values, max_length=16, num_beams=4)
            caption1 = tokenizer1.decode(output_ids[0], skip_special_tokens=True)
            frame_captions["Model 1"].append(caption1)

            # Model 2: FuseCap (uses a text prompt prefix)
            inputs2 = processor2(image, "a picture of ", return_tensors="pt").to(device)
            out2 = model2.generate(**inputs2, num_beams=3)
            caption2 = processor2.decode(out2[0], skip_special_tokens=True)
            frame_captions["Model 2"].append(caption2)

            # Model 3: BLIP Large
            inputs3 = processor3(image, return_tensors="pt").to(device)
            out3 = model3.generate(**inputs3)
            caption3 = processor3.decode(out3[0], skip_special_tokens=True)
            frame_captions["Model 3"].append(caption3)

        count += 1

    vidObj.release()
    return frame_captions

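# Quick sanity check outside Gradio (hypothetical file path, for illustration only):
# captions = process_video("sample.mp4")
# for name, caps in captions.items():
#     print(name, caps[:2])
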
# Gradio Interface
def generate_captions(video):
    captions = process_video(video)
    result = ""
    for model_name, model_captions in captions.items():
        result += f"### {model_name}\n"
        result += "\n".join(f"- {caption}" for caption in model_captions)
        result += "\n\n"
    return result

with gr.Blocks() as demo:
    gr.Markdown("# Video Captioning with Multiple Models 🎥")
    gr.Markdown("Upload a video to generate captions for its frames using three different models.")

    video_input = gr.Video(label="Upload Video")
    output = gr.Textbox(label="Generated Captions", lines=20)
    submit_button = gr.Button("Generate Captions")

    submit_button.click(
        fn=generate_captions,
        inputs=video_input,
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()