# models/clip_model.py
import os
import cv2
import torch
import numpy as np
from transformers import CLIPProcessor, CLIPVisionModelWithProjection
from transformers import logging

# Silence non-error warnings from transformers (e.g. weight-loading notices).
logging.set_verbosity_error()
class FeatureExtractor():
    """Extracts CLIP image embeddings from sampled video frames and caches them to disk."""

    def __init__(self, args):
        self.device = args.feature_extractor_device
        self.beta = args.beta
        self.processor = CLIPProcessor.from_pretrained(args.feature_extractor)
        self.model = CLIPVisionModelWithProjection.from_pretrained(args.feature_extractor).to(self.device)
        self.data_dir = args.data_dir
        self.tmp_dir = args.tmp_dir

    def __call__(self, video_path, video_id):
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        video_length = frame_count / fps
        # Sample one frame every `beta` seconds.
        sample_rate = int(fps) * self.beta

        # Reuse cached features if this video was already processed.
        save_path = os.path.join(self.tmp_dir, video_id + '.npz')
        if os.path.exists(save_path):
            data = np.load(save_path)
            clip_features = data['features']
            return clip_features, video_length

        clip_features = []
        print("Extracting CLIP features.")
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if cap.get(cv2.CAP_PROP_POS_FRAMES) % sample_rate == 0:
                # OpenCV decodes frames as BGR; CLIP expects RGB.
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                inputs = self.processor(images=image, return_tensors="pt").pixel_values
                inputs = inputs.to(self.device)
                with torch.no_grad():
                    feat = self.model(inputs)['image_embeds']
                clip_features.append(feat.cpu().numpy())
        cap.release()
        print("Finished.")

        clip_features = np.concatenate(clip_features, axis=0)
        np.savez_compressed(save_path, features=clip_features)
        return clip_features, video_length
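

# Usage sketch (illustrative, not part of the original module): the args object is
# assumed to expose the attributes read in __init__; the checkpoint name, directories,
# and "demo.mp4" / "demo" inputs below are placeholders, not values from the source.
if __name__ == '__main__':
    from types import SimpleNamespace

    args = SimpleNamespace(
        feature_extractor='openai/clip-vit-base-patch32',  # assumed CLIP checkpoint
        feature_extractor_device='cuda' if torch.cuda.is_available() else 'cpu',
        beta=1,            # sample one frame per second (sample_rate = fps * beta)
        data_dir='data',   # stored on the instance but not used in this file
        tmp_dir='tmp',     # cache directory for the .npz feature files
    )
    os.makedirs(args.tmp_dir, exist_ok=True)

    extractor = FeatureExtractor(args)
    features, video_length = extractor('demo.mp4', 'demo')
    print(features.shape, video_length)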