Spaces: Running on Zero

sindhuhegde committed
Commit • 4b29652 • 1 Parent(s): a0b74a7
Update app
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ

app.py CHANGED
@@ -196,7 +196,6 @@ def resample_video(video_file, video_fname, result_folder):
     video_file_25fps = os.path.join(result_folder, '{}.mp4'.format(video_fname))
 
     # Resample the video to 25 fps
-    # status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i {} -q:v 1 -filter:v fps=25 {}'.format(video_file, video_file_25fps), shell=True)
     status = subprocess.call("ffmpeg -hide_banner -loglevel panic -y -i {} -c:v libx264 -preset veryslow -crf 0 -filter:v fps=25 -pix_fmt yuv420p {}".format(video_file, video_file_25fps), shell=True)
     if status != 0:
         msg = "Oops! Could not resample the video to 25 FPS. Please check the input video and try again."
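
The new command re-encodes with libx264 at CRF 0 (effectively lossless) while forcing 25 fps, instead of the lossy "-q:v 1" call that was previously commented out. For reference, the same step as a small standalone sketch (the helper name and paths are illustrative, not from app.py):

    import subprocess

    def resample_to_25fps(src, dst):
        # Lossless-quality H.264 re-encode at a fixed 25 fps; slower, but avoids quality loss.
        cmd = (
            "ffmpeg -hide_banner -loglevel panic -y -i {} "
            "-c:v libx264 -preset veryslow -crf 0 -filter:v fps=25 -pix_fmt yuv420p {}"
        ).format(src, dst)
        return subprocess.call(cmd, shell=True)  # returns 0 on success
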
@@ -343,7 +342,7 @@ def check_visible_gestures(kp_dict):
             hand_count += 1
 
 
-    if hand_count/len(keypoints) > 0.
+    if hand_count/len(keypoints) > 0.6 or pose_count/len(keypoints) > 0.6:
         msg = "The gestures in the input video are not visible! Please give a video with visible gestures as input."
         return msg
 
@@ -351,7 +350,7 @@ def check_visible_gestures(kp_dict):
 
     return "success"
 
-def load_rgb_masked_frames(input_frames, kp_dict, stride=1, window_frames=25, width=480, height=270):
+def load_rgb_masked_frames(input_frames, kp_dict, asd=False, stride=1, window_frames=25, width=480, height=270):
 
     '''
     This function masks the faces using the keypoints extracted from the frames
@@ -370,47 +369,56 @@ def load_rgb_masked_frames(input_frames, kp_dict, stride=1, window_frames=25, width=480, height=270):
         - msg (string) : Message to be returned
     '''
 
-    # Face indices to extract the face-coordinates needed for masking
-    face_oval_idx = [10, 21, 54, 58, 67, 93, 103, 109, 127, 132, 136, 148, 149, 150, 152, 162, 172,
-                    176, 234, 251, 284, 288, 297, 323, 332, 338, 356, 361, 365, 377, 378, 379, 389, 397, 400, 454]
-
-    input_keypoints, resolution = kp_dict['kps'], kp_dict['resolution']
-    print("Input keypoints: ", len(input_keypoints))
-
     print("Creating masked input frames...")
-    input_frames_masked = []
-    for i, frame_kp_dict in tqdm(enumerate(input_keypoints)):
-
-        img = input_frames[i]
-        face = frame_kp_dict["face"]
-
-        img = cv2.resize(img, (width, height))
-        masked_img = cv2.rectangle(img, (0,0), (width,110), (0,0,0), -1)
-
-        x2, y2 = max(face_kps[:,0]), max(face_kps[:,1])
-        masked_img = cv2.rectangle(img, (0,0), (resolution[1],y2+15), (0,0,0), -1)
 
+    input_frames_masked = []
+    if kp_dict is None:
+        for img in tqdm(input_frames):
+            img = cv2.resize(img, (width, height))
+            masked_img = cv2.rectangle(img, (0,0), (width,110), (0,0,0), -1)
+            input_frames_masked.append(masked_img)
+
+    else:
+        # Face indices to extract the face-coordinates needed for masking
+        face_oval_idx = [10, 21, 54, 58, 67, 93, 103, 109, 127, 132, 136, 148, 149, 150, 152, 162, 172,
+                        176, 234, 251, 284, 288, 297, 323, 332, 338, 356, 361, 365, 377, 378, 379, 389, 397, 400, 454]
+
+        input_keypoints, resolution = kp_dict['kps'], kp_dict['resolution']
+        print("Input keypoints: ", len(input_keypoints))
+
+        for i, frame_kp_dict in tqdm(enumerate(input_keypoints)):
+
+            img = input_frames[i]
+            face = frame_kp_dict["face"]
+
+            if face is None:
+                img = cv2.resize(img, (width, height))
+                masked_img = cv2.rectangle(img, (0,0), (width,110), (0,0,0), -1)
+            else:
+                face_kps = []
+                for idx in range(len(face)):
+                    if idx in face_oval_idx:
+                        x, y = int(face[idx]["x"]*resolution[1]), int(face[idx]["y"]*resolution[0])
+                        face_kps.append((x,y))
+
+                face_kps = np.array(face_kps)
+                x1, y1 = min(face_kps[:,0]), min(face_kps[:,1])
+                x2, y2 = max(face_kps[:,0]), max(face_kps[:,1])
+                masked_img = cv2.rectangle(img, (0,0), (resolution[1],y2+15), (0,0,0), -1)
+
+            if masked_img.shape[0] != width or masked_img.shape[1] != height:
+                masked_img = cv2.resize(masked_img, (width, height))
+
+            input_frames_masked.append(masked_img)
 
     orig_masked_frames = np.array(input_frames_masked)
    input_frames = np.array(input_frames_masked) / 255.
+    if asd:
+        input_frames = np.pad(input_frames, ((12, 12), (0,0), (0,0), (0,0)), 'edge')
+    # print("Input images full: ", input_frames.shape) # num_framesx270x480x3
 
     input_frames = np.array([input_frames[i:i+window_frames, :, :] for i in range(0,input_frames.shape[0], stride) if (i+window_frames <= input_frames.shape[0])])
-    print("Input images window: ", input_frames.shape) # Tx25x270x480x3
+    # print("Input images window: ", input_frames.shape) # Tx25x270x480x3
 
     num_frames = input_frames.shape[0]
 
@@ -420,7 +428,7 @@ def load_rgb_masked_frames(input_frames, kp_dict, stride=1, window_frames=25, width=480, height=270):
 
     return input_frames, num_frames, orig_masked_frames, "success"
 
-def load_spectrograms(wav_file, num_frames=None, window_frames=25, stride=4):
+def load_spectrograms(wav_file, asd=False, num_frames=None, window_frames=25, stride=4):
 
     '''
     This function extracts the spectrogram from the audio file
@@ -457,6 +465,10 @@ def load_spectrograms(wav_file, num_frames=None, window_frames=25, stride=4):
     if frame_diff > 60:
         print("The input video and audio length do not match - The results can be unreliable! Please check the input video.")
 
+    if asd:
+        pad_frames = (window_frames//2)
+        spec = np.pad(spec, ((pad_frames, pad_frames), (0,0), (0,0)), 'edge')
+
     return spec, orig_spec, "success"
 
 
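
The new asd branch edge-pads the spectrogram by half a window (12 frames), mirroring the 12-frame padding added to the video frames above, so per-frame active-speaker scores also exist for the first and last frames of a scene. A toy sketch of the padding (the array shape is made up for illustration):

    import numpy as np

    window_frames = 25
    spec = np.random.rand(100, 80, 1)        # toy (T, mel_bins, 1) spectrogram

    pad = window_frames // 2                 # 12 frames of context on each side
    padded = np.pad(spec, ((pad, pad), (0, 0), (0, 0)), 'edge')
    print(spec.shape, "->", padded.shape)    # (100, 80, 1) -> (124, 80, 1)
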
@@ -624,7 +636,7 @@ def sync_correct_video(video_path, frames, wav_file, offset, result_folder, samp
 
     if offset == 0:
         print("The input audio and video are in-sync! No need to perform sync correction.")
-        return video_path
+        return video_path, "success"
 
     print("Performing Sync Correction...")
     corrected_frames = np.zeros_like(frames)
@@ -682,7 +694,7 @@ def load_masked_input_frames(test_videos, spec, wav_file, scene_num, result_fold
     print("Successfully extracted the keypoints")
 
     # Mask the frames using the keypoints extracted from the frames and prepare the input to the model
-    masked_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict)
+    masked_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict, asd=True)
     if status != "success":
         return None, None, status
     print("Successfully loaded the masked frames")
@@ -806,6 +818,8 @@ def predict_active_speaker(all_video_embeddings, audio_embedding, global_score,
         if global_score=="True":
             score = output.mean(0)
         else:
+            if output.shape[0]<num_avg_frames:
+                num_avg_frames = output.shape[0]
             output_batch = output.unfold(0, num_avg_frames, 1)
             score = torch.mean(output_batch, axis=-1)
 
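
Here unfold(0, num_avg_frames, 1) turns the per-frame scores into overlapping windows of length num_avg_frames so a sliding average can be taken; the new guard shrinks the window when a scene has fewer frames than requested, which would otherwise leave unfold with nothing to return. A toy sketch of the same pattern (values are arbitrary):

    import torch

    scores = torch.arange(10, dtype=torch.float32)   # per-frame scores for one speaker
    num_avg_frames = 25
    if scores.shape[0] < num_avg_frames:             # clamp, as in the updated code
        num_avg_frames = scores.shape[0]

    windows = scores.unfold(0, num_avg_frames, 1)    # shape: (T - w + 1, w)
    smoothed = windows.mean(dim=-1)                  # sliding-window average
    print(windows.shape, smoothed)
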
@@ -823,7 +837,7 @@ def predict_active_speaker(all_video_embeddings, audio_embedding, global_score,
         pred_idx = np.argmax(score)
         pred_speaker.append(pred_idx)
 
-    return pred_speaker
+    return pred_speaker, num_avg_frames
 
 
 def save_video(output_tracks, input_frames, wav_file, result_folder):
@@ -887,7 +901,7 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         print("Applying preprocessing: ", apply_preprocess)
         wav_file, fps, vid_path_processed, status = preprocess_video(video_path, result_folder_input, apply_preprocess)
         if status != "success":
-            return
+            return None, status
         print("Successfully preprocessed the video")
 
         # Resample the video to 25 fps if it is not already 25 fps
@@ -895,10 +909,10 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         if fps!=25:
             vid_path, status = resample_video(vid_path_processed, "preprocessed_video_25fps", result_folder_input)
             if status != "success":
-                return
+                return None, status
             orig_vid_path_25fps, status = resample_video(video_path, "input_video_25fps", result_folder_input)
             if status != "success":
-                return
+                return None, status
         else:
             vid_path = vid_path_processed
             orig_vid_path_25fps = video_path
@@ -906,31 +920,32 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         # Load the original video frames (before pre-processing) - Needed for the final sync-correction
         orig_frames, status = load_video_frames(orig_vid_path_25fps)
         if status != "success":
-            return
+            return None, status
 
         # Load the pre-processed video frames
         frames, status = load_video_frames(vid_path)
         if status != "success":
-            return
+            return None, status
         print("Successfully extracted the video frames")
 
         if len(frames) < num_avg_frames:
+            msg = "Error: The input video is too short. Please use a longer input video."
+            return None, msg
 
         # Load keypoints and check if gestures are visible
         kp_dict, status = get_keypoints(frames)
         if status != "success":
-            return
+            return None, status
         print("Successfully extracted the keypoints: ", len(kp_dict), len(kp_dict["kps"]))
 
         status = check_visible_gestures(kp_dict)
         if status != "success":
-            return
+            return None, status
 
         # Load RGB frames
-        rgb_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict, window_frames=25, width=480, height=270)
+        rgb_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict, asd=False, window_frames=25, width=480, height=270)
         if status != "success":
-            return
+            return None, status
         print("Successfully loaded the RGB frames")
 
         # Convert frames to tensor
@@ -940,9 +955,9 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         print("Successfully converted the frames to tensor")
 
         # Load spectrograms
-        spec, orig_spec, status = load_spectrograms(wav_file,
+        spec, orig_spec, status = load_spectrograms(wav_file, asd=False, num_frames=num_frames)
         if status != "success":
-            return
+            return None, status
         spec = torch.FloatTensor(spec).unsqueeze(0).unsqueeze(0).permute(0, 1, 2, 4, 3)
         print("Successfully loaded the spectrograms")
 
@@ -993,19 +1008,19 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
         # Calculate sync offset
         pred_offset, status = calc_optimal_av_offset(video_emb, audio_emb, num_avg_frames, model)
         if status != "success":
-            return
+            return None, status
         print("Predicted offset: ", pred_offset)
 
         # Generate sync-corrected video
         video_output, status = sync_correct_video(video_path, orig_frames, wav_file, pred_offset, result_folder_output, sample_rate=16000, fps=fps)
         if status != "success":
-            return
+            return None, status
         print("Successfully generated the video:", video_output)
 
-        return f"Predicted offset: {pred_offset}"
+        return video_output, f"Predicted offset: {pred_offset}"
 
     except Exception as e:
-        return f"Error: {str(e)}"
+        return None, f"Error: {str(e)}"
 
 def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
     try:
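
Across both processing paths the bare "return" on failure (which left the Gradio outputs unset) is replaced by a uniform (video, message) pair, matching the outputs=[output_video, result_text] wiring added at the bottom of the file. A minimal sketch of that convention, with an illustrative handler name rather than the app's real dispatcher:

    import gradio as gr

    def run_demo_sketch(video_path):
        # Hypothetical handler following the same (video, message) convention.
        try:
            if video_path is None:
                return None, "Error: no input video provided."
            return video_path, "success"
        except Exception as e:
            return None, f"Error: {str(e)}"

    with gr.Blocks() as sketch:
        video_input = gr.Video()
        output_video = gr.Video(label="Output Video")
        result_text = gr.Textbox(label="Result")
        run_button = gr.Button("Submit")
        run_button.click(fn=run_demo_sketch, inputs=video_input,
                         outputs=[output_video, result_text])
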
@@ -1026,14 +1041,14 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
 
         if global_speaker=="per-frame-prediction" and num_avg_frames<25:
             msg = "Number of frames to average need to be set to a minimum of 25 frames. Atleast 1-second context is needed for the model. Please change the num_avg_frames and try again..."
-            return
+            return None, msg
 
         # Read the video
         try:
             vr = VideoReader(video_path, ctx=cpu(0))
         except:
             msg = "Oops! Could not load the input video file"
-            return
+            return None, msg
 
         # Get the FPS of the video
         fps = vr.get_avg_fps()
@@ -1043,25 +1058,26 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
         if fps!=25:
             test_video_25fps, status = resample_video(video_path, video_fname, result_folder_input)
             if status != "success":
-                return
+                return None, status
         else:
             test_video_25fps = video_path
 
         # Load the video frames
         orig_frames, status = load_video_frames(test_video_25fps)
         if status != "success":
-            return
+            return None, status
 
         # Extract and save the audio file
         orig_wav_file, status = extract_audio(video_path, result_folder)
         if status != "success":
-            return
+            return None, status
 
         # Pre-process and extract per-speaker tracks in each scene
         print("Pre-processing the input video...")
         status = subprocess.call("python preprocess/inference_preprocess.py --data_dir={}/temp --sd_root={}/crops --work_root={}/metadata --data_root={}".format(result_folder_input, result_folder_input, result_folder_input, video_path), shell=True)
         if status != 0:
+            msg = "Error in pre-processing the input video, please check the input video and try again..."
+            return None, msg
 
         # Load the tracks file saved during pre-processing
         with open('{}/metadata/tracks.pckl'.format(result_folder_input), 'rb') as file:
@@ -1094,20 +1110,20 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
 
         if len(test_videos)<=1:
             msg = "To detect the active speaker, at least 2 visible speakers are required for each scene! Please check the input video and try again..."
-            return
+            return None, msg
 
         # Load the audio file
         audio_file = glob(os.path.join("{}/crops".format(result_folder_input), "scene_{}".format(str(scene_num)), "*.wav"))[0]
-        spec, _, status = load_spectrograms(audio_file,
+        spec, _, status = load_spectrograms(audio_file, asd=True)
         if status != "success":
-            return
+            return None, status
         spec = torch.FloatTensor(spec).unsqueeze(0).unsqueeze(0).permute(0,1,2,4,3)
         print("Successfully loaded the spectrograms")
 
         # Load the masked input frames
         all_masked_frames, all_orig_masked_frames, status = load_masked_input_frames(test_videos, spec, audio_file, scene_num, result_folder_input)
         if status != "success":
-            return
+            return None, status
         print("Successfully loaded the masked input frames")
 
         # Prepare the audio and video sequences for the model
@@ -1128,9 +1144,9 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
 
         # Predict the active speaker in each scene
         if global_speaker=="per-frame-prediction":
-            predictions = predict_active_speaker(all_video_embs, audio_emb, "False", num_avg_frames, model)
+            predictions, num_avg_frames = predict_active_speaker(all_video_embs, audio_emb, "False", num_avg_frames, model)
         else:
-            predictions = predict_active_speaker(all_video_embs, audio_emb, "True", num_avg_frames, model)
+            predictions, _ = predict_active_speaker(all_video_embs, audio_emb, "True", num_avg_frames, model)
 
         # Get the frames present in the scene
         frames_scene = tracks[scene_num][0]['track']['frame']
@@ -1152,9 +1168,10 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
         frame_pred = len(frames_scene)-(mid*2)
         start, end = mid, len(frames_scene)-mid
 
+        print("Frame scene: {} | Avg frames: {} | Frame predictions: {}".format(len(frames_scene), num_avg_frames, frame_pred))
         if len(predictions) != frame_pred:
             msg = "Predicted frames {} and input video frames {} do not match!!".format(len(predictions), frame_pred)
-            return
+            return None, msg
 
         active_speakers[start:end] = predictions[0:]
 
@@ -1176,13 +1193,13 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
         # Save the output video
         video_output, status = save_video(output_tracks, orig_frames.copy(), orig_wav_file, result_folder_output)
         if status != "success":
-            return
+            return None, status
         print("Successfully saved the output video: ", video_output)
 
-        return "success"
+        return video_output, "success"
 
     except Exception as e:
-        return f"Error: {str(e)}"
+        return None, f"Error: {str(e)}"
 
 if __name__ == "__main__":
 
@@ -1272,8 +1289,23 @@ if __name__ == "__main__":
         <div class="header">
             <h1><span class="blue-text">GestSync:</span> Determining who is speaking without a talking head</h1>
             <h2>Synchronization and Active Speaker Detection Demo</h2>
-            <p>Sindhu
+            <p><a href='https://www.robots.ox.ac.uk/~vgg/research/gestsync/'>Project Page</a> | <a href='https://github.com/Sindhu-Hegde/gestsync'>Github</a> | <a href='https://arxiv.org/abs/2310.05304'>Paper</a></p>
+        </div>
+    """
+
+
+    tips = """
+        <div>
+        <br><br>
+        Please give us a ⭐ on <a href='https://github.com/Sindhu-Hegde/gestsync'>Github</a> if you like our work!
+
+        Tips to get better results:
+        <ul>
+            <li>Number of Average Frames: Higher the number, better the results.</li>
+            <li>Clicking on "apply pre-processing" will give better results for synchornization, but this is an expensive operation and might take a while.</li>
+            <li>Input videos with clearly visible gestures work better.</li>
+        </ul>
+
         </div>
     """
 
@@ -1291,12 +1323,13 @@ if __name__ == "__main__":
                gr.update(value=75, visible=True), # num_avg_frames
                gr.update(value=None, visible=True), # apply_preprocess
                gr.update(value="global-prediction", visible=False), # global_speaker
-                gr.update(value="", visible=True), # result_text
                gr.update(value=None, visible=True), # output_video
+                gr.update(value="", visible=True), # result_text
                gr.update(visible=True), # submit_button
                gr.update(visible=True), # clear_button
                gr.update(visible=True), # sync_examples
-                gr.update(visible=False) # asd_examples
+                gr.update(visible=False), # asd_examples
+                gr.update(visible=True) # tips
            )
        else:
            return (
@@ -1304,12 +1337,13 @@ if __name__ == "__main__":
                gr.update(value=75, visible=True), # num_avg_frames
                gr.update(value=None, visible=False), # apply_preprocess
                gr.update(value="global-prediction", visible=True), # global_speaker
-                gr.update(value="", visible=True), # result_text
                gr.update(value=None, visible=True), # output_video
+                gr.update(value="", visible=True), # result_text
                gr.update(visible=True), # submit_button
                gr.update(visible=True), # clear_button
                gr.update(visible=False), # sync_examples
-                gr.update(visible=True)
+                gr.update(visible=True), # asd_examples
+                gr.update(visible=True) # tips
            )
 
    def clear_inputs():
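
Both branches of toggle_demo now return one gr.update per component in the outputs list wired to demo_choice.change below (output_video before result_text, plus the new asd_examples and tips entries). A cut-down sketch of the same show/hide pattern, with illustrative component and choice names rather than the app's exact ones:

    import gradio as gr

    def toggle_demo(choice):
        # One gr.update per entry in the `outputs` list, in the same order.
        is_sync = (choice == "Synchronization-correction")
        return (
            gr.update(visible=is_sync),           # apply_preprocess
            gr.update(visible=not is_sync),       # global_speaker
            gr.update(value=None, visible=True),  # output_video
            gr.update(value="", visible=True),    # result_text
        )

    with gr.Blocks() as sketch:
        demo_choice = gr.Radio(["Synchronization-correction", "Active-speaker-detection"], label="Demo")
        apply_preprocess = gr.Checkbox(label="Apply preprocessing", visible=False)
        global_speaker = gr.Radio(["global-prediction", "per-frame-prediction"], visible=False)
        output_video = gr.Video(label="Output Video", visible=False)
        result_text = gr.Textbox(label="Result", visible=False)
        demo_choice.change(fn=toggle_demo, inputs=demo_choice,
                           outputs=[apply_preprocess, global_speaker, output_video, result_text])
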
@@ -1363,8 +1397,8 @@ if __name__ == "__main__":
                    outputs=num_avg_frames
                )
            with gr.Column():
-                result_text = gr.Textbox(label="Result", visible=False)
                output_video = gr.Video(label="Output Video", height=400, visible=False)
+                result_text = gr.Textbox(label="Result", visible=False)
 
        with gr.Row():
            submit_button = gr.Button("Submit", variant="primary", visible=False)
@@ -1389,10 +1423,13 @@ if __name__ == "__main__":
            visible=False
        )
 
+        tips = gr.Markdown(tips, visible=False)
+
+
        demo_choice.change(
            fn=toggle_demo,
            inputs=demo_choice,
-            outputs=[video_input, num_avg_frames, apply_preprocess, global_speaker,
+            outputs=[video_input, num_avg_frames, apply_preprocess, global_speaker, output_video, result_text, submit_button, clear_button, sync_examples, asd_examples, tips]
        )
 
        sync_examples.select(
@@ -1411,7 +1448,7 @@ if __name__ == "__main__":
        submit_button.click(
            fn=process_video,
            inputs=[video_input, demo_choice, global_speaker, num_avg_frames, apply_preprocess],
-            outputs=[
+            outputs=[output_video, result_text]
        )
 
        clear_button.click(
@@ -1420,5 +1457,6 @@ if __name__ == "__main__":
            outputs=[demo_choice, video_input, global_speaker, num_avg_frames, apply_preprocess, result_text, output_video]
        )
 
+
    # Launch the interface
    demo.launch(allowed_paths=["."], share=True)