import numpy as np
import pandas as pd
import cv2
import torch
import warnings
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
import ffmpeg
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms.functional import (
    uniform_temporal_subsample,
    short_side_scale_with_boxes,
    clip_boxes_to_image,
)
from torchvision.transforms._functional_video import normalize
from pytorchvideo.data.ava import AvaLabeledVideoFramePaths
from pytorchvideo.models.hub import slowfast_r50_detection # Another option is slow_r50_detection
from visualization import VideoVisualizer
# This method takes in an image and a Detectron2 predictor and generates bounding boxes for the people in the image.
def get_person_bboxes(inp_img, predictor):
    predictions = predictor(inp_img.cpu().detach().numpy())['instances'].to('cpu')
    boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
    scores = predictions.scores if predictions.has("scores") else None
    classes = np.array(predictions.pred_classes.tolist() if predictions.has("pred_classes") else None)
    # Keep only confident detections of the COCO "person" class (class id 0).
    predicted_boxes = boxes[np.logical_and(classes == 0, scores > 0.75)].tensor.cpu()
    return predicted_boxes
def ava_inference_transform(
    clip,
    boxes,
    num_frames = 32,  # 32 for slowfast_r50_detection; use 4 for slow_r50_detection
    crop_size = 256,
    data_mean = [0.45, 0.45, 0.45],
    data_std = [0.225, 0.225, 0.225],
    slow_fast_alpha = 4,  # 4 for slowfast_r50_detection; use None for slow_r50_detection
    device = 'cpu'):
    boxes = np.array(boxes)
    ori_boxes = boxes.copy()
    # Image [0, 255] -> [0, 1].
    clip = uniform_temporal_subsample(clip, num_frames)
    clip = clip.float()
    clip = clip / 255.0
    height, width = clip.shape[2], clip.shape[3]
    # The format of boxes is [x1, y1, x2, y2]. The input boxes are in the
    # range of [0, width] for x and [0, height] for y.
    boxes = clip_boxes_to_image(boxes, height, width)
    # Resize short side to crop_size. Non-local and STRG use 256.
    clip, boxes = short_side_scale_with_boxes(clip, size=crop_size, boxes=boxes)
    # Normalize images by mean and std.
    clip = normalize(clip, np.array(data_mean, dtype=np.float32), np.array(data_std, dtype=np.float32))
    boxes = clip_boxes_to_image(boxes, clip.shape[2], clip.shape[3])
    # In the case of SlowFast, generate both pathways.
    if slow_fast_alpha is not None:
        fast_pathway = clip
        # Perform temporal sampling from the fast pathway to build the slow pathway.
        slow_pathway = torch.index_select(clip, 1, torch.linspace(
            0, clip.shape[1] - 1, clip.shape[1] // slow_fast_alpha).long())
        clip = [slow_pathway.unsqueeze(0).to(device), fast_pathway.unsqueeze(0).to(device)]
    return clip, torch.from_numpy(boxes), ori_boxes
# Get video info: duration in seconds and frames per second.
def with_opencv(filename):
    video = cv2.VideoCapture(filename)
    frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT)
    fps = video.get(cv2.CAP_PROP_FPS)
    s = round(frame_count / fps)
    video.release()
    return int(s), fps
def slow_fast_train(file_path, gpu=False):
    device = 'cuda' if gpu else 'cpu'
    top_k = 1
    video_model = slowfast_r50_detection(True)  # Another option is slow_r50_detection(True)
    video_model = video_model.eval().to(device)
    # Detectron2 person detector used to propose bounding boxes.
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.55  # set detection threshold for this model
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml")
    cfg.MODEL.DEVICE = device
    predictor = DefaultPredictor(cfg)
    # Create an id-to-label-name mapping.
    label_map, allowed_class_ids = AvaLabeledVideoFramePaths.read_label_map('ava_action_list.pbtxt')
    # Create a video visualizer that can plot bounding boxes and visualize actions on them.
    video_visualizer = VideoVisualizer(81, label_map, top_k=top_k, mode="thres", thres=0.5)  # show the top_k (here top-1) prediction in each bounding box
    # Load the video so clips can be sampled from it.
    encoded_vid = EncodedVideo.from_path(file_path)
    # Action predictions are generated once per second over the whole video.
    total_sec, fps = with_opencv(file_path)
    time_stamp_range = range(0, total_sec)  # time stamps in the video for which a clip is sampled
    clip_duration = 1.0  # duration of the clip used for each inference step
    gif_imgs = []
    xleft, ytop, xright, ybottom = [], [], [], []
    labels = []
    time_frame = []
    scores = []
    for time_stamp in time_stamp_range:
        # Generate a clip around the designated time stamp.
        inp_imgs = encoded_vid.get_clip(
            time_stamp - clip_duration/2.0,
            time_stamp + clip_duration/2.0)
        inp_imgs = inp_imgs['video']
        #if time_stamp % 15 == 0:
        # Generate people bbox predictions using Detectron2's off-the-shelf pre-trained predictor.
        # We use the middle image in each clip to generate the bounding boxes.
        inp_img = inp_imgs[:, inp_imgs.shape[1]//2, :, :]
        inp_img = inp_img.permute(1, 2, 0)
        # Predicted boxes are of the form List[(x_1, y_1, x_2, y_2)]
        predicted_boxes = get_person_bboxes(inp_img, predictor)
        if len(predicted_boxes) == 0:
            print("Skipping clip, no people detected at time stamp: ", time_stamp)
            continue
        # Preprocess clip and bounding boxes for video action recognition.
        inputs, inp_boxes, _ = ava_inference_transform(inp_imgs, predicted_boxes.numpy(), device=device)
        # Prepend data sample id for each bounding box.
        # For more details refer to the RoIAlign in Detectron2.
        inp_boxes = torch.cat([torch.zeros(inp_boxes.shape[0], 1), inp_boxes], dim=1)
        # Generate action predictions for the bounding boxes in the clip.
        # The model takes in the pre-processed video clip and the detected bounding boxes.
        preds = video_model(inputs, inp_boxes.to(device))  # change inputs to inputs.unsqueeze(0).to(device) if using slow_r50
        preds = preds.to('cpu')
        # The model is trained on AVA, whose labels are 1-indexed, so prepend a zero column
        # so that prediction column i lines up with AVA label id i.
        preds = torch.cat([torch.zeros(preds.shape[0], 1), preds], dim=1)
        # Plot predictions on the video and save for later visualization.
        inp_imgs = inp_imgs.permute(1, 2, 3, 0)
        inp_imgs = inp_imgs/255.0
        out_img_pred = video_visualizer.draw_clip_range(inp_imgs, preds, predicted_boxes)
        gif_imgs += out_img_pred
        # Format of bboxes: (x_left, y_top, x_right, y_bottom).
        predicted_boxes_lst = predicted_boxes.tolist()
        topscores, topclasses = torch.topk(preds, k=1)
        topscores, topclasses = topscores.tolist(), topclasses.tolist()
        topclasses = np.concatenate(topclasses)
        topscores = np.concatenate(topscores)
        # Record the top-1 behavior prediction for each box at this time step.
        for i in range(len(predicted_boxes_lst)):
            xleft.append(predicted_boxes_lst[i][0])
            ytop.append(predicted_boxes_lst[i][1])
            xright.append(predicted_boxes_lst[i][2])
            ybottom.append(predicted_boxes_lst[i][3])
            labels.append(label_map.get(topclasses[i]))
            time_frame.append(time_stamp)
            scores.append(topscores[i])
print("Finished generating predictions.")
# Generate Metadata file
metadata = pd.DataFrame()
metadata['frame'] = time_frame
metadata['x_left'] = xleft
metadata['y_top'] = ytop
metadata['x_right'] = xright
metadata['y_bottom'] = ybottom
metadata['label'] = labels
metadata['confidence'] = scores
height, width = gif_imgs[0].shape[0], gif_imgs[0].shape[1]
video_save_path = 'activity_recognition.mp4'
video = cv2.VideoWriter(video_save_path, cv2.VideoWriter_fourcc(*'mp4v'), int(fps), (width, height))
for image in gif_imgs:
img = (255*image).astype(np.uint8)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
video.write(img)
video.release()
return video_save_path, metadata
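# Example usage: a minimal sketch, assuming a local clip named 'example.mp4' (a placeholder,
# not a file shipped with this code) and the 'ava_action_list.pbtxt' label map in the
# working directory. Running the module directly processes one video and prints the outputs.
if __name__ == "__main__":
    save_path, metadata = slow_fast_train('example.mp4', gpu=torch.cuda.is_available())
    print("Annotated video saved to:", save_path)
    print(metadata.head())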