import cv2
import gradio as gr
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from transformers import BlipProcessor, BlipForConditionalGeneration
# Load Models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Model 1: ViT-GPT2
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
# Model 2: FuseCap
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device)
# Model 3: BLIP Large
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
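# Illustrative sketch (not part of the original app flow): captions a single PIL image with the
# three models loaded above, which can serve as a quick smoke test before running a full video.
# The name `caption_single_image` is an illustrative choice, e.g.
# caption_single_image(Image.open("frame.jpg")).
def caption_single_image(image):
    # Model 1: ViT-GPT2
    pixel_values = feature_extractor1(images=[image], return_tensors="pt").pixel_values.to(device)
    ids = model1.generate(pixel_values, max_length=16, num_beams=4)
    vit_gpt2_caption = tokenizer1.decode(ids[0], skip_special_tokens=True)

    # Model 2: FuseCap, conditioned on the prompt "a picture of "
    fusecap_inputs = processor2(image, "a picture of ", return_tensors="pt").to(device)
    fusecap_caption = processor2.decode(
        model2.generate(**fusecap_inputs, num_beams=3)[0], skip_special_tokens=True
    )

    # Model 3: BLIP Large, unconditional captioning
    blip_inputs = processor3(image, return_tensors="pt").to(device)
    blip_caption = processor3.decode(model3.generate(**blip_inputs)[0], skip_special_tokens=True)

    return {"Model 1": vit_gpt2_caption, "Model 2": fusecap_caption, "Model 3": blip_caption}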
# Frame Extraction and Captioning Logic
def process_video(video_path):
    """Sample every 20th frame of the video and caption it with all three models."""
    vidObj = cv2.VideoCapture(video_path)
    count = 0
    success = True
    frame_captions = {"Model 1": [], "Model 2": [], "Model 3": []}
    while success:
        success, frame = vidObj.read()
        if not success:
            break
        # Caption every 20th frame to keep runtime manageable
        if count % 20 == 0:
            # OpenCV delivers BGR frames; convert to RGB before handing them to PIL
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Model 1: ViT-GPT2
            pixel_values = feature_extractor1(images=[image], return_tensors="pt").pixel_values.to(device)
            output_ids = model1.generate(pixel_values, max_length=16, num_beams=4)
            caption1 = tokenizer1.decode(output_ids[0], skip_special_tokens=True)
            frame_captions["Model 1"].append(caption1)

            # Model 2: FuseCap, conditioned on the prompt "a picture of "
            inputs2 = processor2(image, "a picture of ", return_tensors="pt").to(device)
            out2 = model2.generate(**inputs2, num_beams=3)
            caption2 = processor2.decode(out2[0], skip_special_tokens=True)
            frame_captions["Model 2"].append(caption2)

            # Model 3: BLIP Large, unconditional captioning
            inputs3 = processor3(image, return_tensors="pt").to(device)
            out3 = model3.generate(**inputs3)
            caption3 = processor3.decode(out3[0], skip_special_tokens=True)
            frame_captions["Model 3"].append(caption3)
        count += 1
    vidObj.release()
    return frame_captions
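# Alternative sampling sketch (an assumption, not used by the app below): instead of a fixed
# every-20th-frame stride, sample roughly one frame per `seconds` of video using the stream's
# reported FPS. `sample_frames_by_seconds` is a hypothetical helper name.
def sample_frames_by_seconds(video_path, seconds=1.0):
    vid = cv2.VideoCapture(video_path)
    fps = vid.get(cv2.CAP_PROP_FPS) or 30.0  # fall back to 30 if FPS is not reported
    stride = max(1, int(round(fps * seconds)))
    frames = []
    index = 0
    while True:
        success, frame = vid.read()
        if not success:
            break
        if index % stride == 0:
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        index += 1
    vid.release()
    return frames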
# Gradio Interface
def generate_captions(video):
    """Gradio callback: caption the uploaded video's frames and format the results per model."""
    captions = process_video(video)
    result = ""
    for model_name, model_captions in captions.items():
        result += f"### {model_name}\n"
        result += "\n".join(f"- {caption}" for caption in model_captions)
        result += "\n\n"
    return result
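# Optional formatting sketch (an assumption, not part of the original flow): consecutive frames
# often yield identical captions, so collapsing adjacent duplicates keeps the output readable.
# `dedupe_consecutive` is a hypothetical helper name, e.g.
# "\n".join(f"- {c}" for c in dedupe_consecutive(model_captions)).
def dedupe_consecutive(captions):
    deduped = []
    for caption in captions:
        if not deduped or deduped[-1] != caption:
            deduped.append(caption)
    return deduped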
with gr.Blocks() as demo:
    gr.Markdown("# Video Captioning with Multiple Models 🎥")
    gr.Markdown("Upload a video to generate captions for its frames using three different models.")
    video_input = gr.Video(label="Upload Video")
    output = gr.Textbox(label="Generated Captions", lines=20)
    submit_button = gr.Button("Generate Captions")
    submit_button.click(
        fn=generate_captions,
        inputs=video_input,
        outputs=output,
    )
if __name__ == "__main__":
    demo.launch()