sindhuhegde committed
Commit
4b29652
•
1 Parent(s): a0b74a7

Update app

Files changed (2)
  1. .DS_Store +0 -0
  2. app.py +115 -77
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -196,7 +196,6 @@ def resample_video(video_file, video_fname, result_folder):
     video_file_25fps = os.path.join(result_folder, '{}.mp4'.format(video_fname))
 
     # Resample the video to 25 fps
-    # status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i {} -q:v 1 -filter:v fps=25 {}'.format(video_file, video_file_25fps), shell=True)
     status = subprocess.call("ffmpeg -hide_banner -loglevel panic -y -i {} -c:v libx264 -preset veryslow -crf 0 -filter:v fps=25 -pix_fmt yuv420p {}".format(video_file, video_file_25fps), shell=True)
     if status != 0:
         msg = "Oops! Could not resample the video to 25 FPS. Please check the input video and try again."
@@ -343,7 +342,7 @@ def check_visible_gestures(kp_dict):
            hand_count += 1


-    if hand_count/len(keypoints) > 0.7 or pose_count/len(keypoints) > 0.7:
+    if hand_count/len(keypoints) > 0.6 or pose_count/len(keypoints) > 0.6:
        msg = "The gestures in the input video are not visible! Please give a video with visible gestures as input."
        return msg

@@ -351,7 +350,7 @@ def check_visible_gestures(kp_dict):

    return "success"

-def load_rgb_masked_frames(input_frames, kp_dict, stride=1, window_frames=25, width=480, height=270):
+def load_rgb_masked_frames(input_frames, kp_dict, asd=False, stride=1, window_frames=25, width=480, height=270):

    '''
    This function masks the faces using the keypoints extracted from the frames
@@ -370,47 +369,56 @@ def load_rgb_masked_frames(input_frames, kp_dict, stride=1, window_frames=25, wi
        - msg (string) : Message to be returned
    '''

-    # Face indices to extract the face-coordinates needed for masking
-    face_oval_idx = [10, 21, 54, 58, 67, 93, 103, 109, 127, 132, 136, 148, 149, 150, 152, 162, 172,
-                    176, 234, 251, 284, 288, 297, 323, 332, 338, 356, 361, 365, 377, 378, 379, 389, 397, 400, 454]
-
-
-    input_keypoints, resolution = kp_dict['kps'], kp_dict['resolution']
-    print("Input keypoints: ", len(input_keypoints))
-
    print("Creating masked input frames...")
-    input_frames_masked = []
-    for i, frame_kp_dict in tqdm(enumerate(input_keypoints)):
-
-        img = input_frames[i]
-        face = frame_kp_dict["face"]

-        if face is None:
+    input_frames_masked = []
+    if kp_dict is None:
+        for img in tqdm(input_frames):
            img = cv2.resize(img, (width, height))
            masked_img = cv2.rectangle(img, (0,0), (width,110), (0,0,0), -1)
-        else:
-            face_kps = []
-            for idx in range(len(face)):
-                if idx in face_oval_idx:
-                    x, y = int(face[idx]["x"]*resolution[1]), int(face[idx]["y"]*resolution[0])
-                    face_kps.append((x,y))
+            input_frames_masked.append(masked_img)
+
+    else:
+        # Face indices to extract the face-coordinates needed for masking
+        face_oval_idx = [10, 21, 54, 58, 67, 93, 103, 109, 127, 132, 136, 148, 149, 150, 152, 162, 172,
+                        176, 234, 251, 284, 288, 297, 323, 332, 338, 356, 361, 365, 377, 378, 379, 389, 397, 400, 454]
+
+        input_keypoints, resolution = kp_dict['kps'], kp_dict['resolution']
+        print("Input keypoints: ", len(input_keypoints))
+
+        for i, frame_kp_dict in tqdm(enumerate(input_keypoints)):

-            face_kps = np.array(face_kps)
-            x1, y1 = min(face_kps[:,0]), min(face_kps[:,1])
-            x2, y2 = max(face_kps[:,0]), max(face_kps[:,1])
-            masked_img = cv2.rectangle(img, (0,0), (resolution[1],y2+15), (0,0,0), -1)
+            img = input_frames[i]
+            face = frame_kp_dict["face"]

-            if masked_img.shape[0] != width or masked_img.shape[1] != height:
-                masked_img = cv2.resize(masked_img, (width, height))
+            if face is None:
+                img = cv2.resize(img, (width, height))
+                masked_img = cv2.rectangle(img, (0,0), (width,110), (0,0,0), -1)
+            else:
+                face_kps = []
+                for idx in range(len(face)):
+                    if idx in face_oval_idx:
+                        x, y = int(face[idx]["x"]*resolution[1]), int(face[idx]["y"]*resolution[0])
+                        face_kps.append((x,y))
+
+                face_kps = np.array(face_kps)
+                x1, y1 = min(face_kps[:,0]), min(face_kps[:,1])
+                x2, y2 = max(face_kps[:,0]), max(face_kps[:,1])
+                masked_img = cv2.rectangle(img, (0,0), (resolution[1],y2+15), (0,0,0), -1)
+
+                if masked_img.shape[0] != width or masked_img.shape[1] != height:
+                    masked_img = cv2.resize(masked_img, (width, height))

-        input_frames_masked.append(masked_img)
+            input_frames_masked.append(masked_img)

    orig_masked_frames = np.array(input_frames_masked)
    input_frames = np.array(input_frames_masked) / 255.
-    print("Input images full: ", input_frames.shape) # num_framesx270x480x3
+    if asd:
+        input_frames = np.pad(input_frames, ((12, 12), (0,0), (0,0), (0,0)), 'edge')
+    # print("Input images full: ", input_frames.shape) # num_framesx270x480x3

    input_frames = np.array([input_frames[i:i+window_frames, :, :] for i in range(0,input_frames.shape[0], stride) if (i+window_frames <= input_frames.shape[0])])
-    print("Input images window: ", input_frames.shape) # Tx25x270x480x3
+    # print("Input images window: ", input_frames.shape) # Tx25x270x480x3

    num_frames = input_frames.shape[0]

@@ -420,7 +428,7 @@ def load_rgb_masked_frames(input_frames, kp_dict, stride=1, window_frames=25, wi

    return input_frames, num_frames, orig_masked_frames, "success"

-def load_spectrograms(wav_file, num_frames=None, window_frames=25, stride=4):
+def load_spectrograms(wav_file, asd=False, num_frames=None, window_frames=25, stride=4):

    '''
    This function extracts the spectrogram from the audio file
@@ -457,6 +465,10 @@ def load_spectrograms(wav_file, num_frames=None, window_frames=25, stride=4):
    if frame_diff > 60:
        print("The input video and audio length do not match - The results can be unreliable! Please check the input video.")

+    if asd:
+        pad_frames = (window_frames//2)
+        spec = np.pad(spec, ((pad_frames, pad_frames), (0,0), (0,0)), 'edge')
+
    return spec, orig_spec, "success"


@@ -624,7 +636,7 @@ def sync_correct_video(video_path, frames, wav_file, offset, result_folder, samp

    if offset == 0:
        print("The input audio and video are in-sync! No need to perform sync correction.")
-        return video_path
+        return video_path, "success"

    print("Performing Sync Correction...")
    corrected_frames = np.zeros_like(frames)
@@ -682,7 +694,7 @@ def load_masked_input_frames(test_videos, spec, wav_file, scene_num, result_fold
    print("Successfully extracted the keypoints")

    # Mask the frames using the keypoints extracted from the frames and prepare the input to the model
-    masked_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict)
+    masked_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict, asd=True)
    if status != "success":
        return None, None, status
    print("Successfully loaded the masked frames")
@@ -806,6 +818,8 @@ def predict_active_speaker(all_video_embeddings, audio_embedding, global_score,
        if global_score=="True":
            score = output.mean(0)
        else:
+            if output.shape[0]<num_avg_frames:
+                num_avg_frames = output.shape[0]
            output_batch = output.unfold(0, num_avg_frames, 1)
            score = torch.mean(output_batch, axis=-1)

@@ -823,7 +837,7 @@ def predict_active_speaker(all_video_embeddings, audio_embedding, global_score,
        pred_idx = np.argmax(score)
        pred_speaker.append(pred_idx)

-    return pred_speaker
+    return pred_speaker, num_avg_frames


def save_video(output_tracks, input_frames, wav_file, result_folder):
@@ -887,7 +901,7 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
        print("Applying preprocessing: ", apply_preprocess)
        wav_file, fps, vid_path_processed, status = preprocess_video(video_path, result_folder_input, apply_preprocess)
        if status != "success":
-            return status, None
+            return None, status
        print("Successfully preprocessed the video")

        # Resample the video to 25 fps if it is not already 25 fps
@@ -895,10 +909,10 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
        if fps!=25:
            vid_path, status = resample_video(vid_path_processed, "preprocessed_video_25fps", result_folder_input)
            if status != "success":
-                return status, None
+                return None, status
            orig_vid_path_25fps, status = resample_video(video_path, "input_video_25fps", result_folder_input)
            if status != "success":
-                return status, None
+                return None, status
        else:
            vid_path = vid_path_processed
            orig_vid_path_25fps = video_path
@@ -906,31 +920,32 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
        # Load the original video frames (before pre-processing) - Needed for the final sync-correction
        orig_frames, status = load_video_frames(orig_vid_path_25fps)
        if status != "success":
-            return status, None
+            return None, status

        # Load the pre-processed video frames
        frames, status = load_video_frames(vid_path)
        if status != "success":
-            return status, None
+            return None, status
        print("Successfully extracted the video frames")

        if len(frames) < num_avg_frames:
-            return "Error: The input video is too short. Please use a longer input video.", None
+            msg = "Error: The input video is too short. Please use a longer input video."
+            return None, msg

        # Load keypoints and check if gestures are visible
        kp_dict, status = get_keypoints(frames)
        if status != "success":
-            return status, None
+            return None, status
        print("Successfully extracted the keypoints: ", len(kp_dict), len(kp_dict["kps"]))

        status = check_visible_gestures(kp_dict)
        if status != "success":
-            return status, None
+            return None, status

        # Load RGB frames
-        rgb_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict, window_frames=25, width=480, height=270)
+        rgb_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict, asd=False, window_frames=25, width=480, height=270)
        if status != "success":
-            return status, None
+            return None, status
        print("Successfully loaded the RGB frames")

        # Convert frames to tensor
@@ -940,9 +955,9 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
        print("Successfully converted the frames to tensor")

        # Load spectrograms
-        spec, orig_spec, status = load_spectrograms(wav_file, num_frames, window_frames=25)
+        spec, orig_spec, status = load_spectrograms(wav_file, asd=False, num_frames=num_frames)
        if status != "success":
-            return status, None
+            return None, status
        spec = torch.FloatTensor(spec).unsqueeze(0).unsqueeze(0).permute(0, 1, 2, 4, 3)
        print("Successfully loaded the spectrograms")

@@ -993,19 +1008,19 @@ def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
        # Calculate sync offset
        pred_offset, status = calc_optimal_av_offset(video_emb, audio_emb, num_avg_frames, model)
        if status != "success":
-            return status, None
+            return None, status
        print("Predicted offset: ", pred_offset)

        # Generate sync-corrected video
        video_output, status = sync_correct_video(video_path, orig_frames, wav_file, pred_offset, result_folder_output, sample_rate=16000, fps=fps)
        if status != "success":
-            return status, None
+            return None, status
        print("Successfully generated the video:", video_output)

-        return f"Predicted offset: {pred_offset}", video_output
+        return video_output, f"Predicted offset: {pred_offset}"

    except Exception as e:
-        return f"Error: {str(e)}", None
+        return None, f"Error: {str(e)}"

def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
    try:
@@ -1026,14 +1041,14 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):

        if global_speaker=="per-frame-prediction" and num_avg_frames<25:
            msg = "Number of frames to average need to be set to a minimum of 25 frames. Atleast 1-second context is needed for the model. Please change the num_avg_frames and try again..."
-            return msg, None
+            return None, msg

        # Read the video
        try:
            vr = VideoReader(video_path, ctx=cpu(0))
        except:
            msg = "Oops! Could not load the input video file"
-            return msg, None
+            return None, msg

        # Get the FPS of the video
        fps = vr.get_avg_fps()
@@ -1043,25 +1058,26 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
        if fps!=25:
            test_video_25fps, status = resample_video(video_path, video_fname, result_folder_input)
            if status != "success":
-                return status, None
+                return None, status
        else:
            test_video_25fps = video_path

        # Load the video frames
        orig_frames, status = load_video_frames(test_video_25fps)
        if status != "success":
-            return status, None
+            return None, status

        # Extract and save the audio file
        orig_wav_file, status = extract_audio(video_path, result_folder)
        if status != "success":
-            return status, None
+            return None, status

        # Pre-process and extract per-speaker tracks in each scene
        print("Pre-processing the input video...")
        status = subprocess.call("python preprocess/inference_preprocess.py --data_dir={}/temp --sd_root={}/crops --work_root={}/metadata --data_root={}".format(result_folder_input, result_folder_input, result_folder_input, video_path), shell=True)
        if status != 0:
-            return "Error in pre-processing the input video, please check the input video and try again...", None
+            msg = "Error in pre-processing the input video, please check the input video and try again..."
+            return None, msg

        # Load the tracks file saved during pre-processing
        with open('{}/metadata/tracks.pckl'.format(result_folder_input), 'rb') as file:
@@ -1094,20 +1110,20 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):

            if len(test_videos)<=1:
                msg = "To detect the active speaker, at least 2 visible speakers are required for each scene! Please check the input video and try again..."
-                return msg, None
+                return None, msg

            # Load the audio file
            audio_file = glob(os.path.join("{}/crops".format(result_folder_input), "scene_{}".format(str(scene_num)), "*.wav"))[0]
-            spec, _, status = load_spectrograms(audio_file, window_frames=25)
+            spec, _, status = load_spectrograms(audio_file, asd=True)
            if status != "success":
-                return status, None
+                return None, status
            spec = torch.FloatTensor(spec).unsqueeze(0).unsqueeze(0).permute(0,1,2,4,3)
            print("Successfully loaded the spectrograms")

            # Load the masked input frames
            all_masked_frames, all_orig_masked_frames, status = load_masked_input_frames(test_videos, spec, audio_file, scene_num, result_folder_input)
            if status != "success":
-                return status, None
+                return None, status
            print("Successfully loaded the masked input frames")

            # Prepare the audio and video sequences for the model
@@ -1128,9 +1144,9 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):

            # Predict the active speaker in each scene
            if global_speaker=="per-frame-prediction":
-                predictions = predict_active_speaker(all_video_embs, audio_emb, "False", num_avg_frames, model)
+                predictions, num_avg_frames = predict_active_speaker(all_video_embs, audio_emb, "False", num_avg_frames, model)
            else:
-                predictions = predict_active_speaker(all_video_embs, audio_emb, "True", num_avg_frames, model)
+                predictions, _ = predict_active_speaker(all_video_embs, audio_emb, "True", num_avg_frames, model)

            # Get the frames present in the scene
            frames_scene = tracks[scene_num][0]['track']['frame']
@@ -1152,9 +1168,10 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
                frame_pred = len(frames_scene)-(mid*2)
                start, end = mid, len(frames_scene)-mid

+            print("Frame scene: {} | Avg frames: {} | Frame predictions: {}".format(len(frames_scene), num_avg_frames, frame_pred))
            if len(predictions) != frame_pred:
                msg = "Predicted frames {} and input video frames {} do not match!!".format(len(predictions), frame_pred)
-                return msg, None
+                return None, msg

            active_speakers[start:end] = predictions[0:]

@@ -1176,13 +1193,13 @@ def process_video_activespeaker(video_path, global_speaker, num_avg_frames):
        # Save the output video
        video_output, status = save_video(output_tracks, orig_frames.copy(), orig_wav_file, result_folder_output)
        if status != "success":
-            return status, None
+            return None, status
        print("Successfully saved the output video: ", video_output)

-        return "success", video_output
+        return video_output, "success"

    except Exception as e:
-        return f"Error: {str(e)}", None
+        return None, f"Error: {str(e)}"

if __name__ == "__main__":

@@ -1272,8 +1289,23 @@ if __name__ == "__main__":
    <div class="header">
        <h1><span class="blue-text">GestSync:</span> Determining who is speaking without a talking head</h1>
        <h2>Synchronization and Active Speaker Detection Demo</h2>
-        <p>Sindhu Hegde and Andrew Zisserman</p>
-        <p>VGG, University of Oxford</p>
+        <p><a href='https://www.robots.ox.ac.uk/~vgg/research/gestsync/'>Project Page</a> | <a href='https://github.com/Sindhu-Hegde/gestsync'>Github</a> | <a href='https://arxiv.org/abs/2310.05304'>Paper</a></p>
+    </div>
+    """
+
+
+    tips = """
+    <div>
+    <br><br>
+    Please give us a 🌟 on <a href='https://github.com/Sindhu-Hegde/gestsync'>Github</a> if you like our work!
+
+    Tips to get better results:
+    <ul>
+        <li>Number of Average Frames: Higher the number, better the results.</li>
+        <li>Clicking on "apply pre-processing" will give better results for synchornization, but this is an expensive operation and might take a while.</li>
+        <li>Input videos with clearly visible gestures work better.</li>
+    </ul>
+
    </div>
    """

@@ -1291,12 +1323,13 @@ if __name__ == "__main__":
                gr.update(value=75, visible=True), # num_avg_frames
                gr.update(value=None, visible=True), # apply_preprocess
                gr.update(value="global-prediction", visible=False), # global_speaker
-                gr.update(value="", visible=True), # result_text
                gr.update(value=None, visible=True), # output_video
+                gr.update(value="", visible=True), # result_text
                gr.update(visible=True), # submit_button
                gr.update(visible=True), # clear_button
                gr.update(visible=True), # sync_examples
-                gr.update(visible=False) # asd_examples
+                gr.update(visible=False), # asd_examples
+                gr.update(visible=True) # tips
            )
        else:
            return (
@@ -1304,12 +1337,13 @@ if __name__ == "__main__":
                gr.update(value=75, visible=True), # num_avg_frames
                gr.update(value=None, visible=False), # apply_preprocess
                gr.update(value="global-prediction", visible=True), # global_speaker
-                gr.update(value="", visible=True), # result_text
                gr.update(value=None, visible=True), # output_video
+                gr.update(value="", visible=True), # result_text
                gr.update(visible=True), # submit_button
                gr.update(visible=True), # clear_button
                gr.update(visible=False), # sync_examples
-                gr.update(visible=True) # asd_examples
+                gr.update(visible=True), # asd_examples
+                gr.update(visible=True) # tips
            )

    def clear_inputs():
@@ -1363,8 +1397,8 @@ if __name__ == "__main__":
                    outputs=num_avg_frames
                )
            with gr.Column():
-                result_text = gr.Textbox(label="Result", visible=False)
                output_video = gr.Video(label="Output Video", height=400, visible=False)
+                result_text = gr.Textbox(label="Result", visible=False)

        with gr.Row():
            submit_button = gr.Button("Submit", variant="primary", visible=False)
@@ -1389,10 +1423,13 @@ if __name__ == "__main__":
            visible=False
        )

+        tips = gr.Markdown(tips, visible=False)
+
+
        demo_choice.change(
            fn=toggle_demo,
            inputs=demo_choice,
-            outputs=[video_input, num_avg_frames, apply_preprocess, global_speaker, result_text, output_video, submit_button, clear_button, sync_examples, asd_examples]
+            outputs=[video_input, num_avg_frames, apply_preprocess, global_speaker, output_video, result_text, submit_button, clear_button, sync_examples, asd_examples, tips]
        )

        sync_examples.select(
@@ -1411,7 +1448,7 @@ if __name__ == "__main__":
        submit_button.click(
            fn=process_video,
            inputs=[video_input, demo_choice, global_speaker, num_avg_frames, apply_preprocess],
-            outputs=[result_text, output_video]
+            outputs=[output_video, result_text]
        )

        clear_button.click(
@@ -1420,5 +1457,6 @@ if __name__ == "__main__":
            outputs=[demo_choice, video_input, global_speaker, num_avg_frames, apply_preprocess, result_text, output_video]
        )

+
    # Launch the interface
    demo.launch(allowed_paths=["."], share=True)