Ahsen Khaliq
Update app.py
2b6ee25
raw history blame
No virus
3.27 kB
import os
os.system("pip install gradio==2.3.6")
import cv2
from PIL import Image
import clip
import torch
import math
import numpy as np
import torch
import datetime
import gradio as gr
# Load the open CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
def inference(video, text):
# The frame images will be stored in video_frames
video_frames = []
# Open the video file
capture = cv2.VideoCapture(video)
fps = capture.get(cv2.CAP_PROP_FPS)
current_frame = 0
# Read the current frame
ret, frame = capture.read()
while capture.isOpened() and ret:
ret,frame = capture.read()
print('Read a new frame: ', ret)
current_frame += 1
if ret:
video_frames.append(Image.fromarray(frame[:, :, ::-1]))
# Print some statistics
print(f"Frames extracted: {len(video_frames)}")
# You can try tuning the batch size for very large videos, but it should usually be OK
batch_size = 256
batches = math.ceil(len(video_frames) / batch_size)
# The encoded features will bs stored in video_features
video_features = torch.empty([0, 512], dtype=torch.float16).to(device)
# Process each batch
for i in range(batches):
print(f"Processing batch {i+1}/{batches}")
# Get the relevant frames
batch_frames = video_frames[i*batch_size : (i+1)*batch_size]
# Preprocess the images for the batch
batch_preprocessed = torch.stack([preprocess(frame) for frame in batch_frames]).to(device)
# Encode with CLIP and normalize
with torch.no_grad():
batch_features = model.encode_image(batch_preprocessed)
batch_features /= batch_features.norm(dim=-1, keepdim=True)
# Append the batch to the list containing all features
video_features = torch.cat((video_features, batch_features))
# Print some stats
print(f"Features: {video_features.shape}")
search_query=text
display_heatmap=False
display_results_count=1
# Encode and normalize the search query using CLIP
with torch.no_grad():
text_features = model.encode_text(clip.tokenize(search_query).to(device))
text_features /= text_features.norm(dim=-1, keepdim=True)
# Compute the similarity between the search query and each frame using the Cosine similarity
similarities = (100.0 * video_features @ text_features.T)
values, best_photo_idx = similarities.topk(display_results_count, dim=0)
for frame_id in best_photo_idx:
frame = video_frames[frame_id]
# Find the timestamp in the video and display it
seconds = round(frame_id.cpu().numpy()[0]/fps)
return frame,f"Found at {str(datetime.timedelta(seconds=seconds))}"
title = "Video Search"
description = "Gradio demo for using OpenAI's CLIP to search inside videos. To use it, simply upload your video and add your text. Read more at the links below."
article = "<p style='text-align: center'><a href='https://github.com/haltakov/natural-language-youtube-search'>Github Repo</a></p>"
gr.Interface(
inference,
["video","text"],
[gr.outputs.Image(type="pil", label="Output"),"text"],
title=title,
description=description,
article=article,
enable_queue=True
).launch(debug=True)