import cv2
import gradio as gr
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

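# Required packages (inferred from the imports above; pin exact versions in
# requirements.txt as needed): gradio, transformers, torch, opencv-python, pillow
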
# Load Models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model 1: ViT-GPT2
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Model 2: FuseCap
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device)

# Model 3: BLIP Large
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)

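# Optional (not in the original Space): switch the models to eval mode so
# dropout is disabled during caption generation.
# for m in (model1, model2, model3):
#     m.eval()
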
# Frame Extraction and Captioning Logic
def process_video(video_path):
    vidObj = cv2.VideoCapture(video_path)
    count = 0
    frame_captions = {"Model 1": [], "Model 2": [], "Model 3": []}

    while True:
        success, frame = vidObj.read()
        if not success:
            break

        # Process every 20th frame
        if count % 20 == 0:
            # OpenCV returns BGR frames; convert to RGB for the PIL/transformers pipeline
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # Model 1: ViT-GPT2
            pixel_values = feature_extractor1(images=[image], return_tensors="pt").pixel_values.to(device)
            output_ids = model1.generate(pixel_values, max_length=16, num_beams=4)
            caption1 = tokenizer1.decode(output_ids[0], skip_special_tokens=True)
            frame_captions["Model 1"].append(caption1)

            # Model 2: FuseCap (uses a text prompt prefix)
            inputs2 = processor2(image, "a picture of ", return_tensors="pt").to(device)
            out2 = model2.generate(**inputs2, num_beams=3)
            caption2 = processor2.decode(out2[0], skip_special_tokens=True)
            frame_captions["Model 2"].append(caption2)

            # Model 3: BLIP Large
            inputs3 = processor3(image, return_tensors="pt").to(device)
            out3 = model3.generate(**inputs3)
            caption3 = processor3.decode(out3[0], skip_special_tokens=True)
            frame_captions["Model 3"].append(caption3)

        count += 1

    vidObj.release()
    return frame_captions

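# Quick sanity check outside Gradio (hypothetical file path, for illustration only):
# captions = process_video("sample.mp4")
# for name, caps in captions.items():
#     print(name, caps[:2])
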
# Gradio Interface
def generate_captions(video):
    captions = process_video(video)
    result = ""
    for model_name, model_captions in captions.items():
        result += f"### {model_name}\n"
        result += "\n".join(f"- {caption}" for caption in model_captions)
        result += "\n\n"
    return result

with gr.Blocks() as demo:
    gr.Markdown("# Video Captioning with Multiple Models 🎥")
    gr.Markdown("Upload a video to generate captions for its frames using three different models.")

    video_input = gr.Video(label="Upload Video")
    output = gr.Textbox(label="Generated Captions", lines=20)
    submit_button = gr.Button("Generate Captions")

    submit_button.click(
        fn=generate_captions,
        inputs=video_input,
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()