Spaces: Running on Zero

sindhuhegde committed
Commit • 4b29652 • 1 Parent(s): a0b74a7
Update app
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ

app.py CHANGED
@@ -196,7 +196,6 @@ def resample_video(video_file, video_fname, result_folder):
     video_file_25fps = os.path.join(result_folder, '{}.mp4'.format(video_fname))
 
     # Resample the video to 25 fps
-    # status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i {} -q:v 1 -filter:v fps=25 {}'.format(video_file, video_file_25fps), shell=True)
     status = subprocess.call("ffmpeg -hide_banner -loglevel panic -y -i {} -c:v libx264 -preset veryslow -crf 0 -filter:v fps=25 -pix_fmt yuv420p {}".format(video_file, video_file_25fps), shell=True)
     if status != 0:
         msg = "Oops! Could not resample the video to 25 FPS. Please check the input video and try again."
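
The new command re-encodes with libx264 at CRF 0 (effectively lossless) while forcing 25 fps, instead of the lossy "-q:v 1" call that was previously commented out. For reference, the same step as a small standalone sketch (the helper name and paths are illustrative, not from app.py):

    import subprocess

    def resample_to_25fps(src, dst):
        # Lossless-quality H.264 re-encode at a fixed 25 fps; slower, but avoids quality loss.
        cmd = (
            "ffmpeg -hide_banner -loglevel panic -y -i {} "
            "-c:v libx264 -preset veryslow -crf 0 -filter:v fps=25 -pix_fmt yuv420p {}"
        ).format(src, dst)
        return subprocess.call(cmd, shell=True)  # returns 0 on success
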
@@ -343,7 +342,7 @@ def check_visible_gestures(kp_dict):
             hand_count += 1
 
 
-    if hand_count/len(keypoints) > 0.
+    if hand_count/len(keypoints) > 0.6 or pose_count/len(keypoints) > 0.6:
         msg = "The gestures in the input video are not visible! Please give a video with visible gestures as input."
         return msg
 
@@ -351,7 +350,7 @@ def check_visible_gestures(kp_dict):
 
     return "success"
 
-def load_rgb_masked_frames(input_frames, kp_dict, stride=1, window_frames=25, width=480, height=270):
+def load_rgb_masked_frames(input_frames, kp_dict, asd=False, stride=1, window_frames=25, width=480, height=270):
 
     '''
     This function masks the faces using the keypoints extracted from the frames
@@ -370,47 +369,56 @@ def load_rgb_masked_frames(input_frames, kp_dict, stride=1, window_frames=25, width=480, height=270):
         - msg (string) : Message to be returned
     '''
 
-    # Face indices to extract the face-coordinates needed for masking
-    face_oval_idx = [10, 21, 54, 58, 67, 93, 103, 109, 127, 132, 136, 148, 149, 150, 152, 162, 172,
-                    176, 234, 251, 284, 288, 297, 323, 332, 338, 356, 361, 365, 377, 378, 379, 389, 397, 400, 454]
-
-    input_keypoints, resolution = kp_dict['kps'], kp_dict['resolution']
-    print("Input keypoints: ", len(input_keypoints))
-
     print("Creating masked input frames...")
-    input_frames_masked = []
-    for i, frame_kp_dict in tqdm(enumerate(input_keypoints)):
-
-        img = input_frames[i]
-        face = frame_kp_dict["face"]
-
-        img = cv2.resize(img, (width, height))
-        masked_img = cv2.rectangle(img, (0,0), (width,110), (0,0,0), -1)
-
-        x2, y2 = max(face_kps[:,0]), max(face_kps[:,1])
-        masked_img = cv2.rectangle(img, (0,0), (resolution[1],y2+15), (0,0,0), -1)
 
+    input_frames_masked = []
+    if kp_dict is None:
+        for img in tqdm(input_frames):
+            img = cv2.resize(img, (width, height))
+            masked_img = cv2.rectangle(img, (0,0), (width,110), (0,0,0), -1)
+            input_frames_masked.append(masked_img)
+
+    else:
+        # Face indices to extract the face-coordinates needed for masking
+        face_oval_idx = [10, 21, 54, 58, 67, 93, 103, 109, 127, 132, 136, 148, 149, 150, 152, 162, 172,
+                        176, 234, 251, 284, 288, 297, 323, 332, 338, 356, 361, 365, 377, 378, 379, 389, 397, 400, 454]
+
+        input_keypoints, resolution = kp_dict['kps'], kp_dict['resolution']
+        print("Input keypoints: ", len(input_keypoints))
+
+        for i, frame_kp_dict in tqdm(enumerate(input_keypoints)):
+
+            img = input_frames[i]
+            face = frame_kp_dict["face"]
+
+            if face is None:
+                img = cv2.resize(img, (width, height))
+                masked_img = cv2.rectangle(img, (0,0), (width,110), (0,0,0), -1)
+            else:
+                face_kps = []
+                for idx in range(len(face)):
+                    if idx in face_oval_idx:
+                        x, y = int(face[idx]["x"]*resolution[1]), int(face[idx]["y"]*resolution[0])
+                        face_kps.append((x,y))
+
+                face_kps = np.array(face_kps)
+                x1, y1 = min(face_kps[:,0]), min(face_kps[:,1])
+                x2, y2 = max(face_kps[:,0]), max(face_kps[:,1])
+                masked_img = cv2.rectangle(img, (0,0), (resolution[1],y2+15), (0,0,0), -1)
+
+            if masked_img.shape[0] != width or masked_img.shape[1] != height:
+                masked_img = cv2.resize(masked_img, (width, height))
+
+            input_frames_masked.append(masked_img)
 
     orig_masked_frames = np.array(input_frames_masked)
    input_frames = np.array(input_frames_masked) / 255.
+    if asd:
+        input_frames = np.pad(input_frames, ((12, 12), (0,0), (0,0), (0,0)), 'edge')
+    # print("Input images full: ", input_frames.shape) # num_framesx270x480x3
 
     input_frames = np.array([input_frames[i:i+window_frames, :, :] for i in range(0,input_frames.shape[0], stride) if (i+window_frames <= input_frames.shape[0])])
-    print("Input images window: ", input_frames.shape) # Tx25x270x480x3
+    # print("Input images window: ", input_frames.shape) # Tx25x270x480x3
 
     num_frames = input_frames.shape[0]
 
@@ -420,7 +428,7 @@ def load_rgb_masked_frames(input_frames, kp_dict, stride=1, window_frames=25, width=480, height=270):
 
     return input_frames, num_frames, orig_masked_frames, "success"
 
-def load_spectrograms(wav_file, num_frames=None, window_frames=25, stride=4):
+def load_spectrograms(wav_file, asd=False, num_frames=None, window_frames=25, stride=4):
 
     '''
     This function extracts the spectrogram from the audio file
@@ -457,6 +465,10 @@ def load_spectrograms(wav_file, num_frames=None, window_frames=25, stride=4):
     if frame_diff > 60:
         print("The input video and audio length do not match - The results can be unreliable! Please check the input video.")
 
+    if asd:
+        pad_frames = (window_frames//2)
+        spec = np.pad(spec, ((pad_frames, pad_frames), (0,0), (0,0)), 'edge')
+
     return spec, orig_spec, "success"
 
 
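
The new asd branch edge-pads the spectrogram by half a window (12 frames), mirroring the 12-frame padding added to the video frames above, so per-frame active-speaker scores also exist for the first and last frames of a scene. A toy sketch of the padding (the array shape is made up for illustration):

    import numpy as np

    window_frames = 25
    spec = np.random.rand(100, 80, 1)        # toy (T, mel_bins, 1) spectrogram

    pad = window_frames // 2                 # 12 frames of context on each side
    padded = np.pad(spec, ((pad, pad), (0, 0), (0, 0)), 'edge')
    print(spec.shape, "->", padded.shape)    # (100, 80, 1) -> (124, 80, 1)
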
@@ -624,7 +636,7 @@ def sync_correct_video(video_path, frames, wav_file, offset, result_folder, samp
 
     if offset == 0:
         print("The input audio and video are in-sync! No need to perform sync correction.")
-        return video_path
+        return video_path, "success"
 
     print("Performing Sync Correction...")
     corrected_frames = np.zeros_like(frames)
@@ -682,7 +694,7 @@ def load_masked_input_frames(test_videos, spec, wav_file, scene_num, result_fold
     print("Successfully extracted the keypoints")
 
     # Mask the frames using the keypoints extracted from the frames and prepare the input to the model
-    masked_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict)
+    masked_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict, asd=True)
     if status != "success":
         return None, None, status
     print("Successfully loaded the masked frames")
@@ -806,6 +818,8 @@ def predict_active_speaker(all_video_embeddings, audio_embedding, global_score,
         if global_score=="True":
             score = output.mean(0)
         else:
+            if output.shape[0]<num_avg_frames:
+                num_avg_frames = output.shape[0]
             output_batch = output.unfold(0, num_avg_frames, 1)
             score = torch.mean(output_batch, axis=-1)
 
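
Here unfold(0, num_avg_frames, 1) turns the per-frame scores into overlapping windows of length num_avg_frames so a sliding average can be taken; the new guard shrinks the window when a scene has fewer frames than requested, which would otherwise leave unfold with nothing to return. A toy sketch of the same pattern (values are arbitrary):

    import torch

    scores = torch.arange(10, dtype=torch.float32)   # per-frame scores for one speaker
    num_avg_frames = 25
    if scores.shape[0] < num_avg_frames:             # clamp, as in the updated code
        num_avg_frames = scores.shape[0]

    windows = scores.unfold(0, num_avg_frames, 1)    # shape: (T - w + 1, w)
    smoothed = windows.mean(dim=-1)                  # sliding-window average
    print(windows.shape, smoothed)
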
@@ -823,7 +837,7 @@ def predict_active_speaker(all_video_embeddings, audio_embedding, global_score,
         pred_idx = np.argmax(score)
         pred_speaker.append(pred_idx)
 
-    return pred_speaker
+    return pred_speaker, num_avg_frames
 
 
 def save_video(output_tracks, input_frames, wav_file, result_folder):
@@ -887,7 +901,7 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         print("Applying preprocessing: ", apply_preprocess)
         wav_file, fps, vid_path_processed, status = preprocess_video(video_path, result_folder_input, apply_preprocess)
         if status != "success":
-            return
+            return None, status
         print("Successfully preprocessed the video")
 
         # Resample the video to 25 fps if it is not already 25 fps
@@ -895,10 +909,10 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         if fps!=25:
             vid_path, status = resample_video(vid_path_processed, "preprocessed_video_25fps", result_folder_input)
             if status != "success":
-                return
+                return None, status
             orig_vid_path_25fps, status = resample_video(video_path, "input_video_25fps", result_folder_input)
             if status != "success":
-                return
+                return None, status
         else:
             vid_path = vid_path_processed
             orig_vid_path_25fps = video_path
@@ -906,31 +920,32 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         # Load the original video frames (before pre-processing) - Needed for the final sync-correction
         orig_frames, status = load_video_frames(orig_vid_path_25fps)
         if status != "success":
-            return
+            return None, status
 
         # Load the pre-processed video frames
         frames, status = load_video_frames(vid_path)
         if status != "success":
-            return
+            return None, status
         print("Successfully extracted the video frames")
 
         if len(frames) < num_avg_frames:
+            msg = "Error: The input video is too short. Please use a longer input video."
+            return None, msg
 
         # Load keypoints and check if gestures are visible
         kp_dict, status = get_keypoints(frames)
         if status != "success":
-            return
+            return None, status
         print("Successfully extracted the keypoints: ", len(kp_dict), len(kp_dict["kps"]))
 
         status = check_visible_gestures(kp_dict)
         if status != "success":
-            return
+            return None, status
 
         # Load RGB frames
-        rgb_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict, window_frames=25, width=480, height=270)
+        rgb_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict, asd=False, window_frames=25, width=480, height=270)
         if status != "success":
-            return
+            return None, status
         print("Successfully loaded the RGB frames")
 
         # Convert frames to tensor
@@ -940,9 +955,9 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         print("Successfully converted the frames to tensor")
 
         # Load spectrograms
-        spec, orig_spec, status = load_spectrograms(wav_file,
+        spec, orig_spec, status = load_spectrograms(wav_file, asd=False, num_frames=num_frames)
         if status != "success":
-            return
+            return None, status
         spec = torch.FloatTensor(spec).unsqueeze(0).unsqueeze(0).permute(0, 1, 2, 4, 3)
         print("Successfully loaded the spectrograms")
 
@@ -993,19 +1008,19 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         # Calculate sync offset
         pred_offset, status = calc_optimal_av_offset(video_emb, audio_emb, num_avg_frames, model)
         if status != "success":
-            return
+            return None, status
         print("Predicted offset: ", pred_offset)
 
         # Generate sync-corrected video
         video_output, status = sync_correct_video(video_path, orig_frames, wav_file, pred_offset, result_folder_output, sample_rate=16000, fps=fps)
         if status != "success":
-            return
+            return None, status
         print("Successfully generated the video:", video_output)
 
-        return f"Predicted offset: {pred_offset}"
+        return video_output, f"Predicted offset: {pred_offset}"
 
     except Exception as e:
-        return f"Error: {str(e)}"
+        return None, f"Error: {str(e)}"
 
 def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
     try:
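
Across both processing paths the bare "return" on failure (which left the Gradio outputs unset) is replaced by a uniform (video, message) pair, matching the outputs=[output_video, result_text] wiring added at the bottom of the file. A minimal sketch of that convention, with an illustrative handler name rather than the app's real dispatcher:

    import gradio as gr

    def run_demo_sketch(video_path):
        # Hypothetical handler following the same (video, message) convention.
        try:
            if video_path is None:
                return None, "Error: no input video provided."
            return video_path, "success"
        except Exception as e:
            return None, f"Error: {str(e)}"

    with gr.Blocks() as sketch:
        video_input = gr.Video()
        output_video = gr.Video(label="Output Video")
        result_text = gr.Textbox(label="Result")
        run_button = gr.Button("Submit")
        run_button.click(fn=run_demo_sketch, inputs=video_input,
                         outputs=[output_video, result_text])
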
@@ -1026,14 +1041,14 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
 
         if global_speaker=="per-frame-prediction" and num_avg_frames<25:
             msg = "Number of frames to average need to be set to a minimum of 25 frames. Atleast 1-second context is needed for the model. Please change the num_avg_frames and try again..."
-            return
+            return None, msg
 
         # Read the video
         try:
             vr = VideoReader(video_path, ctx=cpu(0))
         except:
             msg = "Oops! Could not load the input video file"
-            return
+            return None, msg
 
         # Get the FPS of the video
         fps = vr.get_avg_fps()
@@ -1043,25 +1058,26 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
         if fps!=25:
             test_video_25fps, status = resample_video(video_path, video_fname, result_folder_input)
             if status != "success":
-                return
+                return None, status
         else:
             test_video_25fps = video_path
 
         # Load the video frames
         orig_frames, status = load_video_frames(test_video_25fps)
         if status != "success":
-            return
+            return None, status
 
         # Extract and save the audio file
         orig_wav_file, status = extract_audio(video_path, result_folder)
         if status != "success":
-            return
+            return None, status
 
         # Pre-process and extract per-speaker tracks in each scene
         print("Pre-processing the input video...")
         status = subprocess.call("python preprocess/inference_preprocess.py --data_dir={}/temp --sd_root={}/crops --work_root={}/metadata --data_root={}".format(result_folder_input, result_folder_input, result_folder_input, video_path), shell=True)
         if status != 0:
+            msg = "Error in pre-processing the input video, please check the input video and try again..."
+            return None, msg
 
         # Load the tracks file saved during pre-processing
         with open('{}/metadata/tracks.pckl'.format(result_folder_input), 'rb') as file:
@@ -1094,20 +1110,20 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
 
         if len(test_videos)<=1:
             msg = "To detect the active speaker, at least 2 visible speakers are required for each scene! Please check the input video and try again..."
-            return
+            return None, msg
 
         # Load the audio file
         audio_file = glob(os.path.join("{}/crops".format(result_folder_input), "scene_{}".format(str(scene_num)), "*.wav"))[0]
-        spec, _, status = load_spectrograms(audio_file,
+        spec, _, status = load_spectrograms(audio_file, asd=True)
         if status != "success":
-            return
+            return None, status
         spec = torch.FloatTensor(spec).unsqueeze(0).unsqueeze(0).permute(0,1,2,4,3)
         print("Successfully loaded the spectrograms")
 
         # Load the masked input frames
         all_masked_frames, all_orig_masked_frames, status = load_masked_input_frames(test_videos, spec, audio_file, scene_num, result_folder_input)
         if status != "success":
-            return
+            return None, status
         print("Successfully loaded the masked input frames")
 
         # Prepare the audio and video sequences for the model
@@ -1128,9 +1144,9 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
 
         # Predict the active speaker in each scene
         if global_speaker=="per-frame-prediction":
-            predictions = predict_active_speaker(all_video_embs, audio_emb, "False", num_avg_frames, model)
+            predictions, num_avg_frames = predict_active_speaker(all_video_embs, audio_emb, "False", num_avg_frames, model)
         else:
-            predictions = predict_active_speaker(all_video_embs, audio_emb, "True", num_avg_frames, model)
+            predictions, _ = predict_active_speaker(all_video_embs, audio_emb, "True", num_avg_frames, model)
 
         # Get the frames present in the scene
         frames_scene = tracks[scene_num][0]['track']['frame']
@@ -1152,9 +1168,10 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
         frame_pred = len(frames_scene)-(mid*2)
         start, end = mid, len(frames_scene)-mid
 
+        print("Frame scene: {} | Avg frames: {} | Frame predictions: {}".format(len(frames_scene), num_avg_frames, frame_pred))
         if len(predictions) != frame_pred:
             msg = "Predicted frames {} and input video frames {} do not match!!".format(len(predictions), frame_pred)
-            return
+            return None, msg
 
         active_speakers[start:end] = predictions[0:]
 
@@ -1176,13 +1193,13 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
         # Save the output video
         video_output, status = save_video(output_tracks, orig_frames.copy(), orig_wav_file, result_folder_output)
         if status != "success":
-            return
+            return None, status
         print("Successfully saved the output video: ", video_output)
 
-        return "success"
+        return video_output, "success"
 
     except Exception as e:
-        return f"Error: {str(e)}"
+        return None, f"Error: {str(e)}"
 
 if __name__ == "__main__":
 
@@ -1272,8 +1289,23 @@ if __name__ == "__main__":
         <div class="header">
             <h1><span class="blue-text">GestSync:</span> Determining who is speaking without a talking head</h1>
             <h2>Synchronization and Active Speaker Detection Demo</h2>
-            <p>Sindhu
+            <p><a href='https://www.robots.ox.ac.uk/~vgg/research/gestsync/'>Project Page</a> | <a href='https://github.com/Sindhu-Hegde/gestsync'>Github</a> | <a href='https://arxiv.org/abs/2310.05304'>Paper</a></p>
+        </div>
+    """
+
+
+    tips = """
+        <div>
+        <br><br>
+        Please give us a ⭐ on <a href='https://github.com/Sindhu-Hegde/gestsync'>Github</a> if you like our work!
+
+        Tips to get better results:
+        <ul>
+            <li>Number of Average Frames: Higher the number, better the results.</li>
+            <li>Clicking on "apply pre-processing" will give better results for synchornization, but this is an expensive operation and might take a while.</li>
+            <li>Input videos with clearly visible gestures work better.</li>
+        </ul>
+
         </div>
     """
 
@@ -1291,12 +1323,13 @@ if __name__ == "__main__":
                gr.update(value=75, visible=True), # num_avg_frames
                gr.update(value=None, visible=True), # apply_preprocess
                gr.update(value="global-prediction", visible=False), # global_speaker
-                gr.update(value="", visible=True), # result_text
                gr.update(value=None, visible=True), # output_video
+                gr.update(value="", visible=True), # result_text
                gr.update(visible=True), # submit_button
                gr.update(visible=True), # clear_button
                gr.update(visible=True), # sync_examples
-                gr.update(visible=False) # asd_examples
+                gr.update(visible=False), # asd_examples
+                gr.update(visible=True) # tips
            )
        else:
            return (
@@ -1304,12 +1337,13 @@ if __name__ == "__main__":
                gr.update(value=75, visible=True), # num_avg_frames
                gr.update(value=None, visible=False), # apply_preprocess
                gr.update(value="global-prediction", visible=True), # global_speaker
-                gr.update(value="", visible=True), # result_text
                gr.update(value=None, visible=True), # output_video
+                gr.update(value="", visible=True), # result_text
                gr.update(visible=True), # submit_button
                gr.update(visible=True), # clear_button
                gr.update(visible=False), # sync_examples
-                gr.update(visible=True)
+                gr.update(visible=True), # asd_examples
+                gr.update(visible=True) # tips
            )
 
    def clear_inputs():
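
Both branches of toggle_demo now return one gr.update per component in the outputs list wired to demo_choice.change below (output_video before result_text, plus the new asd_examples and tips entries). A cut-down sketch of the same show/hide pattern, with illustrative component and choice names rather than the app's exact ones:

    import gradio as gr

    def toggle_demo(choice):
        # One gr.update per entry in the `outputs` list, in the same order.
        is_sync = (choice == "Synchronization-correction")
        return (
            gr.update(visible=is_sync),           # apply_preprocess
            gr.update(visible=not is_sync),       # global_speaker
            gr.update(value=None, visible=True),  # output_video
            gr.update(value="", visible=True),    # result_text
        )

    with gr.Blocks() as sketch:
        demo_choice = gr.Radio(["Synchronization-correction", "Active-speaker-detection"], label="Demo")
        apply_preprocess = gr.Checkbox(label="Apply preprocessing", visible=False)
        global_speaker = gr.Radio(["global-prediction", "per-frame-prediction"], visible=False)
        output_video = gr.Video(label="Output Video", visible=False)
        result_text = gr.Textbox(label="Result", visible=False)
        demo_choice.change(fn=toggle_demo, inputs=demo_choice,
                           outputs=[apply_preprocess, global_speaker, output_video, result_text])
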
@@ -1363,8 +1397,8 @@ if __name__ == "__main__":
                    outputs=num_avg_frames
                )
            with gr.Column():
-                result_text = gr.Textbox(label="Result", visible=False)
                output_video = gr.Video(label="Output Video", height=400, visible=False)
+                result_text = gr.Textbox(label="Result", visible=False)
 
        with gr.Row():
            submit_button = gr.Button("Submit", variant="primary", visible=False)
@@ -1389,10 +1423,13 @@ if __name__ == "__main__":
            visible=False
        )
 
+        tips = gr.Markdown(tips, visible=False)
+
+
        demo_choice.change(
            fn=toggle_demo,
            inputs=demo_choice,
-            outputs=[video_input, num_avg_frames, apply_preprocess, global_speaker,
+            outputs=[video_input, num_avg_frames, apply_preprocess, global_speaker, output_video, result_text, submit_button, clear_button, sync_examples, asd_examples, tips]
        )
 
        sync_examples.select(
@@ -1411,7 +1448,7 @@ if __name__ == "__main__":
        submit_button.click(
            fn=process_video,
            inputs=[video_input, demo_choice, global_speaker, num_avg_frames, apply_preprocess],
-            outputs=[
+            outputs=[output_video, result_text]
        )
 
        clear_button.click(
@@ -1420,5 +1457,6 @@ if __name__ == "__main__":
            outputs=[demo_choice, video_input, global_speaker, num_avg_frames, apply_preprocess, result_text, output_video]
        )
 
+
    # Launch the interface
    demo.launch(allowed_paths=["."], share=True)