Spaces:

akhaliq
/

Video_Search_CLIP

Runtime error

Ahsen Khaliq

Update app.py

2b6ee25 over 2 years ago

No virus

3.27 kB

	import os
	os.system("pip install gradio==2.3.6")
	import cv2
	from PIL import Image
	import clip
	import torch
	import math
	import numpy as np
	import torch
	import datetime
	import gradio as gr


	# Run on the GPU when PyTorch reports one is available; otherwise use the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Load the ViT-B/32 CLIP model together with the transform that is applied
	# to each frame before it is encoded.
	model, preprocess = clip.load("ViT-B/32", device=device)



	def _extract_frames(video):
	    """Decode every frame of a video into RGB PIL images.

	    Args:
	        video: Video source handed to ``cv2.VideoCapture``.

	    Returns:
	        A ``(frames, fps)`` tuple: the decoded frames in read order and the
	        value OpenCV reports for ``CAP_PROP_FPS``.
	    """
	    capture = cv2.VideoCapture(video)
	    try:
	        fps = capture.get(cv2.CAP_PROP_FPS)
	        frames = []
	        while True:
	            ret, frame = capture.read()
	            if not ret:
	                # End of stream, or the source could not be opened at all.
	                break
	            # Reverse the channel axis (OpenCV's BGR order -> RGB) for PIL.
	            # Every successfully read frame is kept, including the first one,
	            # so list index == frame number in the video.
	            frames.append(Image.fromarray(frame[:, :, ::-1]))
	    finally:
	        # Always free the capture handle, even if decoding raises.
	        capture.release()
	    return frames, fps


	def _encode_frames(frames, batch_size):
	    """Encode frames with the CLIP image encoder, one batch at a time.

	    Args:
	        frames: Non-empty list of PIL images.
	        batch_size: Maximum number of frames pushed through the model at once.

	    Returns:
	        A tensor with one L2-normalized feature row per frame, in input order.
	    """
	    batches = math.ceil(len(frames) / batch_size)
	    encoded = []
	    for i in range(batches):
	        print(f"Processing batch {i+1}/{batches}")

	        # Get the relevant frames
	        batch_frames = frames[i * batch_size : (i + 1) * batch_size]

	        # Preprocess the images for the batch
	        batch_preprocessed = torch.stack(
	            [preprocess(frame) for frame in batch_frames]
	        ).to(device)

	        # Encode with CLIP and normalize
	        with torch.no_grad():
	            batch_features = model.encode_image(batch_preprocessed)
	            batch_features /= batch_features.norm(dim=-1, keepdim=True)

	        encoded.append(batch_features)

	    # Concatenating the collected batches keeps whatever dtype and width the
	    # model produced instead of hard-coding them in a pre-allocated tensor.
	    return torch.cat(encoded)


	def inference(video, text):
	    """Find the frame of a video that best matches a text query using CLIP.

	    Args:
	        video: Video source handed to ``cv2.VideoCapture``.
	        text: Search query; it is tokenized and encoded with the CLIP text encoder.

	    Returns:
	        A ``(frame, message)`` tuple: the best-matching frame as a PIL image
	        and a message locating it in the video (``"Found at H:MM:SS"``).

	    Raises:
	        ValueError: If no frame could be decoded from ``video``.
	    """
	    video_frames, fps = _extract_frames(video)

	    # Print some statistics
	    print(f"Frames extracted: {len(video_frames)}")
	    if not video_frames:
	        # Fail with a clear message instead of an opaque tensor error later on.
	        raise ValueError("No frames could be read from the video.")

	    # You can try tuning the batch size for very large videos, but it should usually be OK
	    batch_size = 256
	    video_features = _encode_frames(video_frames, batch_size)

	    # Print some stats
	    print(f"Features: {video_features.shape}")

	    # Encode and normalize the search query using CLIP
	    with torch.no_grad():
	        text_features = model.encode_text(clip.tokenize(text).to(device))
	        text_features /= text_features.norm(dim=-1, keepdim=True)

	    # Both sides are unit-normalized, so the dot product is the cosine similarity
	    similarities = 100.0 * video_features @ text_features.T
	    best_idx = int(similarities.argmax())
	    frame = video_frames[best_idx]

	    # Find the timestamp in the video; fall back to the frame number when the
	    # container does not report a usable frame rate.
	    if fps > 0:
	        seconds = round(best_idx / fps)
	        return frame, f"Found at {str(datetime.timedelta(seconds=seconds))}"
	    return frame, f"Found at frame {best_idx}"

	# Texts shown on the demo page.
	title = "Video Search"
	description = (
	    "Gradio demo for using OpenAI's CLIP to search inside videos. "
	    "To use it, simply upload your video and add your text. "
	    "Read more at the links below."
	)
	article = (
	    "<p style='text-align: center'>"
	    "<a href='https://github.com/haltakov/natural-language-youtube-search'>Github Repo</a>"
	    "</p>"
	)

	# Build the web UI around inference(): a video and a text input go in, the
	# returned image and text come out. Then start serving it.
	gr.Interface(
	    fn=inference,
	    inputs=["video", "text"],
	    outputs=[gr.outputs.Image(type="pil", label="Output"), "text"],
	    title=title,
	    description=description,
	    article=article,
	    enable_queue=True,
	).launch(debug=True)