Spaces:

namnh2002
/

video-summarization_timesformer

Sleeping

video-summarization_timesformer / utils.py

nam_nguyenhoai_AI

update src

4808241 6 months ago

2.33 kB

	from transformers import TimesformerModel, VideoMAEImageProcessor
	import torch
	import cv2
	import numpy as np
	from torchvision.transforms import Lambda
	from pytorchvideo.transforms import (
	Normalize,
	)
	from torchvision.transforms import (
	Lambda,
	)
	import os
	from os.path import isfile, join, basename

	def extract_features(frames, device, model, image_processor):
	    """Compute one feature vector for a clip of video frames.

	    The frames are stacked into a single clip, scaled to [0, 1], normalized
	    per channel with the image processor's mean/std, and passed through
	    ``model`` as a batch of one. The first token of the model's first output
	    is returned.

	    Args:
	        frames: Sequence of equally shaped NumPy arrays, one per frame, with
	            the channel axis last, i.e. (H, W, C).
	        device: Device (string or ``torch.device``) to run inference on.
	        model: Module called with a (1, T, C, H, W) tensor whose first
	            output is indexed as (batch, token, feature). Presumably a
	            ``TimesformerModel`` -- confirm against the caller.
	        image_processor: Object exposing per-channel ``image_mean`` and
	            ``image_std`` sequences.

	    Returns:
	        The first-token features, ``outputs[0][:, 0]``, for the single-clip
	        batch. The tensor is detached from autograd.
	    """
	    # Stack the per-frame arrays into one (T, H, W, C) tensor, then move the
	    # channel axis next to the frame axis: (T, C, H, W).
	    clip = torch.stack([torch.from_numpy(frame) for frame in frames])
	    clip = clip.permute(0, 3, 1, 2).to(device)

	    # True division promotes integer pixel values to floating point.
	    clip = clip / 255.0

	    # Broadcast the per-channel statistics over frames, height and width.
	    mean = torch.tensor(
	        image_processor.image_mean, dtype=clip.dtype, device=clip.device
	    ).view(1, -1, 1, 1)
	    std = torch.tensor(
	        image_processor.image_std, dtype=clip.dtype, device=clip.device
	    ).view(1, -1, 1, 1)
	    clip = (clip - mean) / std

	    # Add the batch dimension: (1, T, C, H, W).
	    clip = clip.unsqueeze(0)

	    model.to(device)
	    model.eval()
	    # Inference only: skip autograd bookkeeping so no graph is kept alive
	    # and the returned features do not require gradients.
	    with torch.no_grad():
	        outputs = model(clip)

	    # First token of the first output (the classification token when the
	    # model is a TimesformerModel -- TODO confirm).
	    return outputs[0][:, 0]

	def to_video(selected_frames, frames, output_path, video_fps):
	    """Write the selected frames to ``output_path`` as an 'mp4v'-encoded video.

	    Args:
	        selected_frames: Iterable of indices into ``frames``; frames are
	            written in the order the indices are given.
	        frames: Indexable collection of image arrays. The first frame's
	            shape determines the output size (presumably BGR images as
	            OpenCV expects -- confirm against the caller).
	        output_path: Destination path of the video file.
	        video_fps: Frame rate of the output video.
	    """
	    print("MP4 Format.")
	    # OpenCV takes the frame size as (width, height), i.e. (shape[1], shape[0]).
	    height, width = frames[0].shape[:2]
	    video_writer = cv2.VideoWriter(
	        output_path, cv2.VideoWriter_fourcc(*'mp4v'), video_fps, (width, height)
	    )

	    try:
	        for idx in selected_frames:
	            video_writer.write(frames[idx])
	    finally:
	        # Release the writer even if an index is invalid or a write fails,
	        # so the output file handle is not leaked.
	        video_writer.release()

	    print("Completed summarizing the video (wait for a moment to load).")

	def load_model():
	    """Load the pretrained TimeSformer model and the VideoMAE image processor.

	    Returns:
	        A ``(model, processor, device)`` tuple, where ``device`` is ``'cuda'``
	        when CUDA is available and ``'cpu'`` otherwise. If anything raises
	        while loading, the exception is printed and ``None`` is returned
	        instead, so callers must handle a ``None`` result.
	    """
	    try:
	        if torch.cuda.is_available():
	            target = 'cuda'
	        else:
	            target = 'cpu'
	        backbone = TimesformerModel.from_pretrained("facebook/timesformer-base-finetuned-k600")
	        preprocessor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")
	    except Exception as err:
	        # NOTE(review): failures are only printed; the caller receives None.
	        print(err)
	        return None
	    return backbone, preprocessor, target

	def sum_of_squared_difference(vector1, vector2):
	    """Return the sum of the squared element-wise differences of two arrays.

	    Args:
	        vector1: First operand; must support ``vector1 - vector2``.
	        vector2: Second operand, broadcast-compatible with ``vector1``.

	    Returns:
	        The scalar ``sum((vector1 - vector2) ** 2)`` taken over all elements.
	    """
	    delta = vector1 - vector2
	    return np.sum(np.square(delta))