import gradio as gr
import os
import torch
from shutil import rmtree
from torch import nn
from torch.nn import functional as F
import numpy as np
import subprocess
import cv2
import pickle
import librosa
from ultralytics import YOLO
from decord import VideoReader
from decord import cpu, gpu
from utils.audio_utils import *
from utils.inference_utils import *
from sync_models.gestsync_models import *
from shutil import rmtree, copy, copytree
import scenedetect
from scenedetect.video_manager import VideoManager
from scenedetect.scene_manager import SceneManager
from scenedetect.stats_manager import StatsManager
from scenedetect.detectors import ContentDetector
from scipy.interpolate import interp1d
from scipy import signal

from tqdm import tqdm
from glob import glob
from scipy.io.wavfile import write
import mediapipe as mp
from protobuf_to_dict import protobuf_to_dict
import warnings
import spaces

mp_holistic = mp.solutions.holistic
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Initialize global variables
CHECKPOINT_PATH = "model_rgb.pth"  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_cuda = torch.cuda.is_available()
print("Use cuda status: ", use_cuda)
batch_size = 24
fps = 25
n_negative_samples = 100

facedet_scale=0.25
crop_scale=0
min_track=50
frame_rate=25
num_failed_det=25
min_frame_size=64   
print("Device: ", device)

# Initialize the mediapipe holistic keypoint detection model
holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)



def bb_intersection_over_union(boxA, boxB):
	xA = max(boxA[0], boxB[0])
	yA = max(boxA[1], boxB[1])
	xB = min(boxA[2], boxB[2])
	yB = min(boxA[3], boxB[3])

	interArea = max(0, xB - xA) * max(0, yB - yA)

	boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
	boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])

	iou = interArea / float(boxAArea + boxBArea - interArea)

	return iou
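
# Illustrative example (not executed): identical boxes give an IoU of 1.0 and disjoint
# boxes give 0.0, e.g.
#   bb_intersection_over_union([0, 0, 10, 10], [5, 5, 15, 15]) ~= 25 / 175 ~= 0.14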

def track_shot(scenefaces):
	iouThres = 0.5  # Minimum IOU between consecutive face detections
	tracks = []

	while True:
		track = []
		for framefaces in scenefaces:
			for face in framefaces:
				if track == []:
					track.append(face)
					framefaces.remove(face)
				elif face['frame'] - track[-1]['frame'] <= num_failed_det:
					iou = bb_intersection_over_union(face['bbox'], track[-1]['bbox'])
					if iou > iouThres:
						track.append(face)
						framefaces.remove(face)
						continue
				else:
					break

		if track == []:
			break
		elif len(track) > min_track:
			framenum = np.array([f['frame'] for f in track])
			bboxes = np.array([np.array(f['bbox']) for f in track])

			frame_i = np.arange(framenum[0], framenum[-1] + 1)

			bboxes_i = []
			for ij in range(0, 4):
				interpfn = interp1d(framenum, bboxes[:, ij])
				bboxes_i.append(interpfn(frame_i))
			bboxes_i = np.stack(bboxes_i, axis=1)

			if max(np.mean(bboxes_i[:, 2] - bboxes_i[:, 0]), np.mean(bboxes_i[:, 3] - bboxes_i[:, 1])) > min_frame_size:
				tracks.append({'frame': frame_i, 'bbox': bboxes_i})

	return tracks
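
# Note: track_shot performs a greedy IoU-based association - a detection is appended to the
# current track if it occurs within `num_failed_det` frames of the last detection and overlaps
# it with IoU > 0.5. Missing frames inside a track are filled by linearly interpolating each
# bbox coordinate, and tracks shorter than `min_track` frames or smaller than `min_frame_size`
# pixels are discarded.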

def check_folder(folder):
	if os.path.exists(folder):
		return True
	return False

def del_folder(folder):
	if os.path.exists(folder):
		rmtree(folder)

def read_video(o, start_idx):
	with open(o, 'rb') as o:
		video_stream = VideoReader(o)
		if start_idx > 0:
			video_stream.skip_frames(start_idx)
		return video_stream

def crop_video(avi_dir, tmp_dir, track, cropfile, tight_scale=1):
	fourcc = cv2.VideoWriter_fourcc(*'XVID')
	vOut = cv2.VideoWriter(cropfile + '.avi', fourcc, frame_rate, (480, 270))

	dets = {'x': [], 'y': [], 's': [], 'bbox': track['bbox'], 'frame': track['frame']}

	for det in track['bbox']:
		# Reduce the size of the bounding box by a small factor if tighter crops are needed (default -> no reduction in size)
		width = (det[2] - det[0]) * tight_scale
		height = (det[3] - det[1]) * tight_scale
		center_x = (det[0] + det[2]) / 2
		center_y = (det[1] + det[3]) / 2

		dets['s'].append(max(height, width) / 2)
		dets['y'].append(center_y)  # crop center y
		dets['x'].append(center_x)  # crop center x

	# Smooth detections
	dets['s'] = signal.medfilt(dets['s'], kernel_size=13)
	dets['x'] = signal.medfilt(dets['x'], kernel_size=13)
	dets['y'] = signal.medfilt(dets['y'], kernel_size=13)

	videofile = os.path.join(avi_dir, 'video.avi')
	frame_no_to_start = track['frame'][0]
	video_stream = cv2.VideoCapture(videofile)
	video_stream.set(cv2.CAP_PROP_POS_FRAMES, frame_no_to_start)
	for fidx, frame in enumerate(track['frame']):
		cs = crop_scale
		bs = dets['s'][fidx]  # Detection box size
		bsi = int(bs * (1 + 2 * cs))  # Pad videos by this amount

		image = video_stream.read()[1]
		frame = np.pad(image, ((bsi, bsi), (bsi, bsi), (0, 0)), 'constant', constant_values=(110, 110))

		my = dets['y'][fidx] + bsi  # BBox center Y
		mx = dets['x'][fidx] + bsi  # BBox center X

		face = frame[int(my - bs):int(my + bs * (1 + 2 * cs)), int(mx - bs * (1 + cs)):int(mx + bs * (1 + cs))]
		vOut.write(cv2.resize(face, (480, 270)))
	video_stream.release()
	audiotmp = os.path.join(tmp_dir, 'audio.wav')
	audiostart = (track['frame'][0]) / frame_rate
	audioend = (track['frame'][-1] + 1) / frame_rate

	vOut.release()

	# ========== CROP AUDIO FILE ==========

	command = ("ffmpeg -hide_banner -loglevel panic -y -i %s -ss %.3f -to %.3f %s" % (os.path.join(avi_dir, 'audio.wav'), audiostart, audioend, audiotmp))
	output = subprocess.call(command, shell=True, stdout=None)

	copy(audiotmp, cropfile + '.wav')

	# print('Written %s' % cropfile)
	# print('Mean pos: x %.2f y %.2f s %.2f' % (np.mean(dets['x']), np.mean(dets['y']), np.mean(dets['s'])))

	return {'track': track, 'proc_track': dets}
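
# Note: crop_video writes a per-track 480x270 clip centred on the (median-filtered) detection
# boxes and cuts the matching audio segment with ffmpeg, converting frame indices to seconds at
# `frame_rate` (25 fps); e.g. a track covering frames 100-199 maps to the 4.000 s - 8.000 s span.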

@spaces.GPU(duration=60)
def inference_video(avi_dir, work_dir, padding=0):
	videofile = os.path.join(avi_dir, 'video.avi')
	vidObj = cv2.VideoCapture(videofile)
	yolo_model = YOLO("yolov9m.pt")
	global dets, fidx
	dets = []
	fidx = 0

	print("Detecting people in the video using YOLO...")
	def generate_detections():
		global dets, fidx
		while True:
			success, image = vidObj.read()
			if not success:
				break

			image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

			# Perform person detection
			results = yolo_model(image_np, verbose=False)
			detections = results[0].boxes

			dets.append([])
			for i, det in enumerate(detections):
				x1, y1, x2, y2 = det.xyxy[0].detach().cpu().numpy()
				cls = det.cls[0].detach().cpu().numpy()
				conf = det.conf[0].detach().cpu().numpy()
				if int(cls) == 0 and conf>0.7:  # Class 0 is 'person' in COCO dataset
					x1 = max(0, int(x1) - padding)
					y1 = max(0, int(y1) - padding)
					x2 = min(image_np.shape[1], int(x2) + padding)
					y2 = min(image_np.shape[0], int(y2) + padding)
					dets[-1].append({'frame': fidx, 'bbox': [x1, y1, x2, y2], 'conf': conf})

			fidx += 1
			yield

		return dets

	for _ in tqdm(generate_detections()): 
		pass


	print("Successfully detected people in the video")
	savepath = os.path.join(work_dir, 'faces.pckl')

	with open(savepath, 'wb') as fil:
		pickle.dump(dets, fil)

	return dets
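
# Note: `dets` is a per-frame list of person detections; each entry is a dict of the form
# {'frame': frame_idx, 'bbox': [x1, y1, x2, y2], 'conf': confidence}, keeping only YOLO
# detections of class 0 ('person') with confidence > 0.7, padded by `padding` pixels.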

def scene_detect(avi_dir, work_dir):
	video_manager = VideoManager([os.path.join(avi_dir, 'video.avi')])
	stats_manager = StatsManager()
	scene_manager = SceneManager(stats_manager)
	scene_manager.add_detector(ContentDetector())
	base_timecode = video_manager.get_base_timecode()

	video_manager.set_downscale_factor()
	video_manager.start()
	scene_manager.detect_scenes(frame_source=video_manager)
	scene_list = scene_manager.get_scene_list(base_timecode)

	savepath = os.path.join(work_dir, 'scene.pckl')

	if scene_list == []:
		scene_list = [(video_manager.get_base_timecode(), video_manager.get_current_timecode())]

	with open(savepath, 'wb') as fil:
		pickle.dump(scene_list, fil)

	print('%s - scenes detected %d' % (os.path.join(avi_dir, 'video.avi'), len(scene_list)))

	return scene_list


def process_video_asd(file, sd_root, work_root, data_root, avi_dir, tmp_dir, work_dir, crop_dir, frames_dir):

	video_file_name = os.path.basename(file.strip())
	sd_dest_folder = sd_root
	work_dest_folder = work_root


	del_folder(sd_dest_folder)
	del_folder(work_dest_folder)

	videofile = file

	if os.path.exists(work_dir):
		rmtree(work_dir)

	if os.path.exists(crop_dir):
		rmtree(crop_dir)

	if os.path.exists(avi_dir):
		rmtree(avi_dir)

	if os.path.exists(frames_dir):
		rmtree(frames_dir)

	if os.path.exists(tmp_dir):
		rmtree(tmp_dir)

	os.makedirs(work_dir)
	os.makedirs(crop_dir)
	os.makedirs(avi_dir)
	os.makedirs(frames_dir)
	os.makedirs(tmp_dir)

	command = ("ffmpeg -hide_banner -loglevel panic -y -i %s -qscale:v 2 -async 1 -r 25 %s" % (videofile, 
																os.path.join(avi_dir, 
																'video.avi')))
	status = subprocess.call(command, shell=True, stdout=None)
	if status != 0:
		msg = "Error in pre-processing the video, please check the input video and try again"
		return msg

	command = ("ffmpeg -hide_banner -loglevel panic -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (os.path.join(avi_dir,
																			 'video.avi'), 
																			 os.path.join(avi_dir, 
																			'audio.wav')))
	status = subprocess.call(command, shell=True, stdout=None)
	if status != 0:
		msg = "Error in pre-processing the video, please check the input video and try again"
		return msg

	try:
		faces = inference_video(avi_dir, work_dir)
	except:
		msg = "Error in pre-processing the video, please check the input video and try again"
		return msg
	print("YOLO done")

	print("Detecting scenes in the video...")
	try:
		scene = scene_detect(avi_dir, work_dir)
	except:
		msg = "Error in detecting the scenes in the video, please check the input video and try again"
		return msg
	print("Scene detect done")

	print("Tracking video...")
	allscenes = []
	for shot in scene:
		if shot[1].frame_num - shot[0].frame_num >= min_track:
			allscenes.append(track_shot(faces[shot[0].frame_num:shot[1].frame_num]))

	print("Cropping video...")
	alltracks = []
	for sc_num in range(len(allscenes)):
		vidtracks = []
		for ii, track in enumerate(allscenes[sc_num]):
			os.makedirs(os.path.join(crop_dir, 'scene_'+str(sc_num)), exist_ok=True)
			vidtracks.append(crop_video(avi_dir, tmp_dir, track, os.path.join(crop_dir, 'scene_'+str(sc_num), '%05d' % ii)))
		alltracks.append(vidtracks)

	savepath = os.path.join(work_dir, 'tracks.pckl')

	with open(savepath, 'wb') as fil:
		pickle.dump(alltracks, fil)

	rmtree(tmp_dir)
	rmtree(avi_dir)
	rmtree(frames_dir)
	copytree(crop_dir, sd_dest_folder)
	copytree(work_dir, work_dest_folder)

	return "success"
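
# Pipeline summary: process_video_asd converts the input to a 25 fps AVI plus a 16 kHz mono WAV,
# runs YOLO person detection and scene detection, builds per-scene person tracks, crops one
# video+audio pair per track, and finally copies the crops and metadata to the result folders.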


@spaces.GPU(duration=60)
def get_person_detection(all_frames, frame_count, padding=20):

	try:
		# Load YOLOv9 model (pre-trained on COCO dataset)
		yolo_model = YOLO("yolov9s.pt")
		print("Loaded the YOLO model")

		person_videos = {}
		person_tracks = {}

		print("Processing the frames...")
		for frame_idx in tqdm(range(frame_count)):
		
			frame = all_frames[frame_idx]
		
			# Perform person detection
			results = yolo_model(frame, verbose=False)
			detections = results[0].boxes
		
			for i, det in enumerate(detections):
				x1, y1, x2, y2 = det.xyxy[0]
				cls = det.cls[0]
				if int(cls) == 0:  # Class 0 is 'person' in COCO dataset
				
					x1 = max(0, int(x1) - padding)
					y1 = max(0, int(y1) - padding)
					x2 = min(frame.shape[1], int(x2) + padding)
					y2 = min(frame.shape[0], int(y2) + padding)

					if i not in person_videos:
						person_videos[i] = []
						person_tracks[i] = []

					person_videos[i].append(frame)
					person_tracks[i].append([x1,y1,x2,y2])
			
		
		num_persons = 0
		for i in person_videos.keys():
			if len(person_videos[i]) >= frame_count//2:
				num_persons+=1

		if num_persons==0:
			msg = "No person detected in the video! Please give a video with one person as input"
			return None, None, msg
		if num_persons>1:
			msg = "More than one person detected in the video! Please give a video with only one person as input"
			return None, None, msg

	except:
		msg = "Error in detecting person in the video, please check the input video and try again"
		return None, None, msg

	return person_videos, person_tracks, "success"

def preprocess_video(path, result_folder, apply_preprocess, padding=20):

	'''
	This function preprocesses the input video to extract the audio and crop the frames using the YOLO model

	Args:
		- path (string) : Path of the input video file
		- result_folder (string) : Path of the folder to save the extracted audio and cropped video
		- apply_preprocess (string) : "True" to run person detection and crop the video, "False" to use the input video as-is
		- padding (int) : Padding (in pixels) to add to the bounding box
	Returns:
		- wav_file (string) : Path of the extracted audio file
		- fps (int) : FPS of the input video
		- video_output (string) : Path of the cropped video file
		- msg (string) : Message to be returned
	'''
	
	# Load all video frames
	try:
		vr = VideoReader(path, ctx=cpu(0))
		fps = vr.get_avg_fps()
		frame_count = len(vr)
	except:
		msg = "Oops! Could not load the video. Please check the input video and try again."
		return None, None, None, msg

	if frame_count < 25:
		msg = "Not enough frames to process! Please give a longer video as input"
		return None, None, None, msg

	# Extract the audio from the input video file using ffmpeg
	wav_file  = os.path.join(result_folder, "audio.wav")

	status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -async 1 -ac 1 -vn \
					-acodec pcm_s16le -ar 16000 %s -y' % (path, wav_file), shell=True)

	if status != 0:
		msg = "Oops! Could not load the audio file. Please check the input video and try again."
		return None, None, None, msg
	print("Extracted the audio from the video")

	if apply_preprocess=="True":
		all_frames = []
		for k in range(len(vr)):
				all_frames.append(vr[k].asnumpy())
		all_frames = np.asarray(all_frames)
		print("Extracted the frames for pre-processing")

		person_videos, person_tracks, msg = get_person_detection(all_frames, frame_count, padding)
		if msg != "success":
			return None, None, None, msg		

		# For the person detected, crop the frame based on the bounding box
		if len(person_videos[0]) > frame_count-10:
			crop_filename = os.path.join(result_folder, "preprocessed_video.avi")
			fourcc = cv2.VideoWriter_fourcc(*'DIVX')

			# Get the union bounding box over the full track (min of the x1/y1 corners, max of the x2/y2 corners)
			max_x1 = min([track[0] for track in person_tracks[0]])
			max_y1 = min([track[1] for track in person_tracks[0]])
			max_x2 = max([track[2] for track in person_tracks[0]])
			max_y2 = max([track[3] for track in person_tracks[0]])

			max_width = max_x2 - max_x1
			max_height = max_y2 - max_y1

			out = cv2.VideoWriter(crop_filename, fourcc, fps, (max_width, max_height))
			for frame in person_videos[0]:
				crop = frame[max_y1:max_y2, max_x1:max_x2]
				crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
				out.write(crop)
			out.release()

			no_sound_video = crop_filename.split('.')[0] + '_nosound.mp4'
			status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -c copy -an -strict -2 %s' % (crop_filename, no_sound_video), shell=True)
			if status != 0:
				msg = "Oops! Could not preprocess the video. Please check the input video and try again."
				return None, None, None, msg
			
			video_output = crop_filename.split('.')[0] + '.mp4'
			status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -i %s -strict -2 -q:v 1 %s' % 
							(wav_file , no_sound_video, video_output), shell=True)
			if status != 0:
				msg = "Oops! Could not preprocess the video. Please check the input video and try again."
				return None, None, None, msg
			
			os.remove(crop_filename)
			os.remove(no_sound_video)

			print("Successfully saved the pre-processed video: ", video_output)
		else:
			msg = "Could not track the person in the full video! Please give a single-speaker video as input"
			return None, None, None, msg

	else:
		video_output = path

	return wav_file, fps, video_output, "success"

def resample_video(video_file, video_fname, result_folder):

	'''
	This function resamples the video to 25 fps

	Args:
		- video_file (string) : Path of the input video file
		- video_fname (string) : Name of the input video file
		- result_folder (string) : Path of the folder to save the resampled video
	Returns:
		- video_file_25fps (string) : Path of the resampled video file
		- msg (string) : Message to be returned
	'''

	video_file_25fps = os.path.join(result_folder, '{}.mp4'.format(video_fname))
	
	# Resample the video to 25 fps
	status = subprocess.call("ffmpeg -hide_banner -loglevel panic -y -i {} -c:v libx264 -preset veryslow -crf 0 -filter:v fps=25 -pix_fmt yuv420p {}".format(video_file, video_file_25fps), shell=True)
	if status != 0:
		msg = "Oops! Could not resample the video to 25 FPS. Please check the input video and try again."
		return None, msg
	print('Resampled the video to 25 fps: {}'.format(video_file_25fps))

	return video_file_25fps, "success"

def load_checkpoint(path, model):
	'''
	This function loads the trained model from the checkpoint

	Args:
		- path (string) : Path of the checkpoint file
		- model (object) : Model object
	Returns:
		- model (object) : Model object with the weights loaded from the checkpoint
	'''	

	# Load the checkpoint
	checkpoint = torch.load(path, map_location="cpu")
	
	s = checkpoint["state_dict"]
	new_s = {}
	
	for k, v in s.items():
		new_s[k.replace('module.', '')] = v
	model.load_state_dict(new_s)

	print("Loaded checkpoint from: {}".format(path))

	return model.eval()


def load_video_frames(video_file):
	'''
	This function extracts the frames from the video

	Args:
		- video_file (string) : Path of the video file
	Returns:
		- frames (list) : List of frames extracted from the video
		- msg (string) : Message to be returned
	'''

	# Read the video
	try:
		vr = VideoReader(video_file, ctx=cpu(0))
	except:
		msg = "Oops! Could not load the input video file"
		return None, msg


	# Extract the frames
	frames = []
	for k in range(len(vr)):
		frames.append(vr[k].asnumpy())

	frames = np.asarray(frames)

	return frames, "success"



def get_keypoints(frames):

	'''
	This function extracts the keypoints from the frames using MediaPipe Holistic pipeline

	Args:
		- frames (list) : List of frames extracted from the video
	Returns:
		- kp_dict (dict) : Dictionary containing the keypoints and the resolution of the frames
		- msg (string) : Message to be returned
	'''

	try:
		holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) 

		resolution = frames[0].shape
		all_frame_kps = []

		for frame in frames:

			results = holistic.process(frame)

			pose, left_hand, right_hand, face = None, None, None, None
			if results.pose_landmarks is not None:
				pose = protobuf_to_dict(results.pose_landmarks)['landmark']
			if results.left_hand_landmarks is not None:
				left_hand = protobuf_to_dict(results.left_hand_landmarks)['landmark']
			if results.right_hand_landmarks is not None:
				right_hand = protobuf_to_dict(results.right_hand_landmarks)['landmark']
			if results.face_landmarks is not None:
				face = protobuf_to_dict(results.face_landmarks)['landmark']

			frame_dict = {"pose":pose, "left_hand":left_hand, "right_hand":right_hand, "face":face}

			all_frame_kps.append(frame_dict)

		kp_dict = {"kps":all_frame_kps, "resolution":resolution}
	except Exception as e:
		print("Error: ", e)
		return None, "Error: Could not extract keypoints from the frames"

	return kp_dict, "success"


def check_visible_gestures(kp_dict):

	'''
	This function checks if the gestures in the video are visible

	Args:
		- kp_dict (dict) : Dictionary containing the keypoints and the resolution of the frames
	Returns:
		- msg (string) : Message to be returned
	'''

	keypoints = kp_dict['kps']
	keypoints = np.array(keypoints)

	if len(keypoints)<25:
		msg = "Not enough keypoints to process! Please give a longer video as input"
		return msg
	
	pose_count, hand_count = 0, 0
	for frame_kp_dict in keypoints:

		pose = frame_kp_dict["pose"]
		left_hand = frame_kp_dict["left_hand"]
		right_hand = frame_kp_dict["right_hand"]

		if pose is None:
			pose_count += 1
		
		if left_hand is None and right_hand is None:
			hand_count += 1


	if hand_count/len(keypoints) > 0.6 or pose_count/len(keypoints) > 0.6:
		msg = "The gestures in the input video are not visible! Please give a video with visible gestures as input."
		return msg

	print("Successfully verified the input video - Gestures are visible!")

	return "success"

def load_rgb_masked_frames(input_frames, kp_dict, asd=False, stride=1, window_frames=25, width=480, height=270):

	'''
	This function masks the faces using the keypoints extracted from the frames

	Args:
		- input_frames (list) : List of frames extracted from the video
		- kp_dict (dict) : Dictionary containing the keypoints and the resolution of the frames
		- asd (bool) : Whether to use padding (needed for active speaker detection task) or not
		- stride (int) : Stride to extract the frames
		- window_frames (int) : Number of frames in each window that is given as input to the model
		- width (int) : Width of the frames
		- height (int) : Height of the frames
	Returns:
		- input_frames (array) : Frame window to be given as input to the model
		- num_frames (int) : Number of frames to extract
		- orig_masked_frames (array) : Masked frames extracted from the video
		- msg (string) : Message to be returned
	'''

	print("Creating masked input frames...")

	input_frames_masked = []
	if kp_dict is None:
		for img in tqdm(input_frames):
			img = cv2.resize(img, (width, height))
			masked_img = cv2.rectangle(img, (0,0), (width,110), (0,0,0), -1)
			input_frames_masked.append(masked_img)

	else:
		# Face indices to extract the face-coordinates needed for masking
		face_oval_idx = [10, 21, 54, 58, 67, 93, 103, 109, 127, 132, 136, 148, 149, 150, 152, 162, 172, 
						176, 234, 251, 284, 288, 297, 323, 332, 338, 356, 361, 365, 377, 378, 379, 389, 397, 400, 454]

		input_keypoints, resolution = kp_dict['kps'], kp_dict['resolution']
		print("Input keypoints: ", len(input_keypoints))

		for i, frame_kp_dict in tqdm(enumerate(input_keypoints)):

			img = input_frames[i]
			face = frame_kp_dict["face"]

			if face is None:
				img = cv2.resize(img, (width, height))
				masked_img = cv2.rectangle(img, (0,0), (width,110), (0,0,0), -1)
			else:
				face_kps = []
				for idx in range(len(face)):
					if idx in face_oval_idx:
						x, y = int(face[idx]["x"]*resolution[1]), int(face[idx]["y"]*resolution[0])
						face_kps.append((x,y))

				face_kps = np.array(face_kps)
				x1, y1 = min(face_kps[:,0]), min(face_kps[:,1])
				x2, y2 = max(face_kps[:,0]), max(face_kps[:,1])
				masked_img = cv2.rectangle(img, (0,0), (resolution[1],y2+15), (0,0,0), -1)

			if masked_img.shape[0] != height or masked_img.shape[1] != width:
				masked_img = cv2.resize(masked_img, (width, height))

			input_frames_masked.append(masked_img)

	orig_masked_frames = np.array(input_frames_masked)
	input_frames = np.array(input_frames_masked) / 255.
	if asd:
		input_frames = np.pad(input_frames, ((12, 12), (0,0), (0,0), (0,0)), 'edge')

	input_frames = np.array([input_frames[i:i+window_frames, :, :] for i in range(0,input_frames.shape[0], stride) if (i+window_frames <= input_frames.shape[0])])
	print("Successfully created masked input frames")
	
	num_frames = input_frames.shape[0]

	if num_frames<10:
		msg = "Not enough frames to process! Please give a longer video as input."
		return None, None, None, msg
	
	return input_frames, num_frames, orig_masked_frames, "success"
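
# Shape note (a sketch of the expected tensors): with the defaults stride=1 and window_frames=25,
# N masked frames of size 270x480 yield a window array of shape (N - 24, 25, 270, 480, 3).
# For the ASD task, 12 frames of edge-padding on either side roughly centres each 25-frame
# window on its middle frame, so one window is produced per original frame.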

def load_spectrograms(wav_file, asd=False, num_frames=None, window_frames=25, stride=4):

	'''
	This function extracts the spectrogram from the audio file

	Args:
		- wav_file (string) : Path of the extracted audio file
		- asd (bool) : Whether to use padding (needed for active speaker detection task) or not
		- num_frames (int) : Number of frames to extract
		- window_frames (int) : Number of frames in each window that is given as input to the model
		- stride (int) : Stride to extract the audio frames
	Returns:
		- spec (array) : Spectrogram array window to be used as input to the model
		- orig_spec (array) : Spectrogram array extracted from the audio file
		- msg (string) : Message to be returned
	'''

	# Extract the audio from the input video file using ffmpeg
	try:
		wav = librosa.load(wav_file, sr=16000)[0]
	except:
		msg = "Oops! Could not extract the spectrograms from the audio file. Please check the input and try again."
		return None, None, msg
	
	# Convert to tensor
	wav = torch.FloatTensor(wav).unsqueeze(0)
	mel, _, _, _ = wav2filterbanks(wav)
	spec = mel.squeeze(0).cpu().numpy()
	orig_spec = spec
	spec = np.array([spec[i:i+(window_frames*stride), :] for i in range(0, spec.shape[0], stride) if (i+(window_frames*stride) <= spec.shape[0])])

	if num_frames is not None:
		if len(spec) != num_frames:
			spec = spec[:num_frames]
		frame_diff = np.abs(len(spec) - num_frames)
		if frame_diff > 60:
			print("The input video and audio length do not match - The results can be unreliable! Please check the input video.")

	if asd:
		pad_frames = (window_frames//2)
		spec = np.pad(spec, ((pad_frames, pad_frames), (0,0), (0,0)), 'edge')

	return spec, orig_spec, "success"
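
# Note: the default stride of 4 assumes the mel filterbank from wav2filterbanks is computed at
# roughly 100 frames per second (a typical 10 ms hop), so 4 spectrogram frames correspond to one
# video frame at 25 fps and each window of 25*4 = 100 spectrogram frames spans one second of audio.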


def calc_optimal_av_offset(vid_emb, aud_emb, num_avg_frames, model):
	'''
	This function calculates the audio-visual offset between the video and audio

	Args:
		- vid_emb (array) : Video embedding array
		- aud_emb (array) : Audio embedding array
		- num_avg_frames (int) : Number of frames to average the scores
		- model (object) : Model object
	Returns:
		- offset (int) : Optimal audio-visual offset
		- msg (string) : Message to be returned
	'''

	pos_vid_emb, all_aud_emb, pos_idx, stride, status = create_online_sync_negatives(vid_emb, aud_emb, num_avg_frames)
	if status != "success":
		return None, status
	scores, _ = calc_av_scores(pos_vid_emb, all_aud_emb, model)
	offset = scores.argmax()*stride - pos_idx

	return offset.item(), "success"
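
# Worked example (illustrative): with the default stride of 5, if the aligned (positive) audio
# window starts at frame pos_idx = 50 (window index 10) and the best-scoring window is index 12,
# the predicted offset is 12*5 - 50 = +10 frames, i.e. the best-matching audio starts 10 frames
# after the nominally aligned position.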

def create_online_sync_negatives(vid_emb, aud_emb, num_avg_frames, stride=5):

	'''
	This function creates all possible positive and negative audio embeddings to compare and obtain the sync offset

	Args:
		- vid_emb (array) : Video embedding array
		- aud_emb (array) : Audio embedding array
		- num_avg_frames (int) : Number of frames to average the scores
		- stride (int) : Stride to extract the negative windows
	Returns:
		- vid_emb_pos (array) : Positive video embedding array
		- aud_emb_posneg (array) : All possible combinations of audio embedding array 
		- pos_idx_frame (int) : Positive video embedding array frame
		- stride (int) : Stride used to extract the negative windows
		- msg (string) : Message to be returned
	'''

	slice_size = num_avg_frames
	aud_emb_posneg = aud_emb.squeeze(1).unfold(-1, slice_size, stride)
	aud_emb_posneg = aud_emb_posneg.permute([0, 2, 1, 3])
	aud_emb_posneg = aud_emb_posneg[:, :int(n_negative_samples/stride)+1]

	pos_idx = (aud_emb_posneg.shape[1]//2)
	pos_idx_frame = pos_idx*stride

	min_offset_frames = -(pos_idx)*stride
	max_offset_frames = (aud_emb_posneg.shape[1] - pos_idx - 1)*stride
	print("With the current video length and the number of average frames, the model can predict the offsets in the range: [{}, {}]".format(min_offset_frames, max_offset_frames))

	vid_emb_pos = vid_emb[:, :, pos_idx_frame:pos_idx_frame+slice_size]
	if vid_emb_pos.shape[2] != slice_size:
		msg = "Video is too short to use {} frames to average the scores. Please use a longer input video or reduce the number of average frames".format(slice_size)
		return None, None, None, None, msg
	
	return vid_emb_pos, aud_emb_posneg, pos_idx_frame, stride, "success"
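
# Note: `unfold` slides a window of `num_avg_frames` audio frames with the given stride over the
# audio embedding; at most n_negative_samples/stride + 1 windows are kept, the centre window is
# taken as the positive (in-sync) candidate and the remaining windows act as shifted negatives,
# giving the offset range [-pos_idx*stride, (num_windows - pos_idx - 1)*stride] printed above.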

def calc_av_scores(vid_emb, aud_emb, model):

	'''
	This function calls functions to calculate the audio-visual similarity and attention map between the video and audio embeddings

	Args:
		- vid_emb (array) : Video embedding array
		- aud_emb (array) : Audio embedding array
		- model (object) : Model object
	Returns:
		- scores (array) : Audio-visual similarity scores
		- att_map (array) : Attention map
	'''

	scores = calc_att_map(vid_emb, aud_emb, model)
	att_map = logsoftmax_2d(torch.Tensor(scores))
	scores = scores.mean(-1)
	
	return scores, att_map

def calc_att_map(vid_emb, aud_emb, model):

	'''
	This function calculates the similarity between the video and audio embeddings

	Args:
		- vid_emb (array) : Video embedding array
		- aud_emb (array) : Audio embedding array
		- model (object) : Model object
	Returns:
		- scores (array) : Audio-visual similarity scores
	'''

	vid_emb = vid_emb[:, :, None]
	aud_emb = aud_emb.transpose(1, 2)

	scores = run_func_in_parts(lambda x, y: (x * y).sum(1),
							   vid_emb,
							   aud_emb,
							   part_len=10,
							   dim=3)

	scores = model.logits_scale(scores[..., None]).squeeze(-1)

	return scores.detach().cpu().numpy()

def generate_video(frames, audio_file, video_fname):
	
	'''
	This function generates the video from the frames and audio file

	Args:
		- frames (array) : Frames to be used to generate the video
		- audio_file (string) : Path of the audio file
		- video_fname (string) : Path of the video file
	Returns:
		- video_output (string) : Path of the video file
		- msg (string) : Message to be returned
	'''	

	fname = 'inference.avi'
	video = cv2.VideoWriter(fname, cv2.VideoWriter_fourcc(*'DIVX'), 25, (frames[0].shape[1], frames[0].shape[0]))

	for i in range(len(frames)):
		video.write(cv2.cvtColor(frames[i], cv2.COLOR_BGR2RGB))
	video.release()
	
	no_sound_video = video_fname + '_nosound.mp4'
	status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -c copy -an -strict -2 %s' % (fname, no_sound_video), shell=True)
	if status != 0:
		msg = "Oops! Could not generate the video. Please check the input video and try again."
		return None, msg

	video_output = video_fname + '.mp4'
	status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -i %s -c:v libx264 -preset veryslow -crf 18 -pix_fmt yuv420p -strict -2 -q:v 1 -shortest %s' % 
				(audio_file, no_sound_video, video_output), shell=True)

	if status != 0:
		msg = "Oops! Could not generate the video. Please check the input video and try again."
		return None, msg

	os.remove(fname)
	os.remove(no_sound_video)
	
	return video_output, "success"

def sync_correct_video(video_path, frames, wav_file, offset, result_folder, sample_rate=16000, fps=25):

	'''
	This function corrects the video and audio to sync with each other

	Args:
		- video_path (string) : Path of the video file
		- frames (array) : Frames to be used to generate the video
		- wav_file (string) : Path of the audio file
		- offset (int) : Predicted sync-offset to be used to correct the video
		- result_folder (string) : Path of the result folder to save the output sync-corrected video
		- sample_rate (int) : Sample rate of the audio
		- fps (int) : Frames per second of the video
	Returns:
		- video_output (string) : Path of the video file
		- msg (string) : Message to be returned
	'''

	if offset == 0:
		print("The input audio and video are in-sync! No need to perform sync correction.")
		return video_path, "success"
	
	print("Performing Sync Correction...")
	corrected_frames = np.zeros_like(frames)
	if offset > 0:
		audio_offset = int(offset*(sample_rate/fps))
		wav = librosa.core.load(wav_file, sr=sample_rate)[0]
		corrected_wav = wav[audio_offset:]
		corrected_wav_file = os.path.join(result_folder, "audio_sync_corrected.wav")
		write(corrected_wav_file, sample_rate, corrected_wav)
		wav_file = corrected_wav_file
		corrected_frames = frames
	elif offset < 0:
		corrected_frames[0:len(frames)+offset] = frames[np.abs(offset):]
		corrected_frames = corrected_frames[:len(frames)-np.abs(offset)]

	corrected_video_path = os.path.join(result_folder, "result_sync_corrected")
	video_output, status = generate_video(corrected_frames, wav_file, corrected_video_path)
	if status != "success":
		return None, status

	return video_output, "success"
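
# Note: for a positive offset the audio is advanced by trimming offset*(sample_rate/fps) samples
# from the start of the waveform (e.g. +5 frames at 25 fps and 16 kHz trims 3200 samples), while
# for a negative offset the video frames are shifted earlier and the trailing frames are dropped.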


def load_masked_input_frames(test_videos, spec, wav_file, scene_num, result_folder):

	'''
	This function loads the masked input frames from the video

	Args:
		- test_videos (list) : List of videos to be processed (speaker-specific tracks)
		- spec (array) : Spectrogram of the audio
		- wav_file (string) : Path of the audio file
		- scene_num (int) : Scene number to be used to save the input masked video
		- result_folder (string) : Path of the folder to save the input masked video
	Returns:
		- all_frames (list) : List of masked input frames window to be used as input to the model
		- all_orig_frames (list) : List of original masked input frames
	'''

	all_frames, all_orig_frames = [], []
	for video_num, video in enumerate(test_videos):

		print("Processing video: ", video)

		# Load the video frames
		frames, status = load_video_frames(video)
		if status != "success":
			return None, None, status
		print("Successfully loaded the video frames")

		# Extract the keypoints from the frames
		# kp_dict, status = get_keypoints(frames)
		# if status != "success":
		# 	return None, None, status
		# print("Successfully extracted the keypoints")

		# Mask the frames using the keypoints extracted from the frames and prepare the input to the model
		masked_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict=None, asd=True)
		if status != "success":
			return None, None, status
		print("Successfully loaded the masked frames")


		# Check if the length of the input frames is equal to the length of the spectrogram
		if spec.shape[2]!=masked_frames.shape[0]:
			num_frames = spec.shape[2]
			masked_frames = masked_frames[:num_frames]
			orig_masked_frames = orig_masked_frames[:num_frames]
			frame_diff = np.abs(spec.shape[2] - num_frames)
			if frame_diff > 60:
				print("The input video and audio length do not match - The results can be unreliable! Please check the input video.")

		# Transpose the frames to the correct format
		frames = np.transpose(masked_frames, (4, 0, 1, 2, 3))
		frames = torch.FloatTensor(np.array(frames)).unsqueeze(0)
		print("Successfully converted the frames to tensor")

		all_frames.append(frames)
		all_orig_frames.append(orig_masked_frames)


	return all_frames, all_orig_frames, "success"

def extract_audio(video, result_folder):

	'''
	This function extracts the audio from the video file

	Args:
		- video (string) : Path of the video file
		- result_folder (string) : Path of the folder to save the extracted audio file
	Returns:
		- wav_file (string) : Path of the extracted audio file
		- msg (string) : Message to be returned
	'''

	wav_file  = os.path.join(result_folder, "audio.wav")

	status = subprocess.call('ffmpeg -hide_banner -loglevel panic -threads 1 -y -i %s -async 1 -ac 1 -vn \
					-acodec pcm_s16le -ar 16000 %s' % (video, wav_file), shell=True)

	if status != 0:
		msg = "Oops! Could not load the audio file in the given input video. Please check the input and try again"
		return None, msg
	
	return wav_file, "success"

@spaces.GPU(duration=60)
def get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_emb=True):

	'''
	This function extracts the video and audio embeddings from the input frames and audio sequences

	Args:
		- video_sequences (array) : Array of video frames to be used as input to the model
		- audio_sequences (array) : Array of audio frames to be used as input to the model
		- model (object) : Model object
		- asd (bool) : Active speaker detection task flag to return the correct dimensions for the embeddings
		- calc_aud_emb (bool) : Flag to calculate the audio embedding
	Returns:
		- video_emb (array) : Video embedding
		- audio_emb (array) : Audio embedding
	'''

	video_emb = []
	audio_emb = []

	for i in range(0, len(video_sequences), batch_size):
		video_inp = video_sequences[i:i+batch_size, ]  		
		vid_emb = model.forward_vid(video_inp, return_feats=False)
		vid_emb = torch.mean(vid_emb, axis=-1)
		if not asd:
			vid_emb = vid_emb.unsqueeze(-1)

		video_emb.extend(vid_emb.detach().cpu().numpy())

		if calc_aud_emb:
			audio_inp = audio_sequences[i:i+batch_size, ]
			aud_emb = model.forward_aud(audio_inp)
			audio_emb.extend(aud_emb.detach().cpu().numpy())
		
		torch.cuda.empty_cache()

	video_emb = np.array(video_emb)
	print("Video Embedding Shape: ", video_emb.shape)

	if calc_aud_emb:
		audio_emb = np.array(audio_emb)
		print("Audio Embedding Shape: ", audio_emb.shape)

		return video_emb, audio_emb       

	return video_emb
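
# Note: embeddings are computed in mini-batches of `batch_size` windows; for the sync-offset task
# the video embedding is mean-pooled over its temporal axis and a singleton dimension is kept so
# that process_video_syncoffset can later split and stack the windows back into a
# (batch, channels, num_windows) tensor.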



def predict_active_speaker(all_video_embeddings, audio_embedding, global_score, num_avg_frames, model):
	
	'''
	This function predicts the active speaker in each frame

	Args:
		- all_video_embeddings (array) : Array of video embeddings of all speakers
		- audio_embedding (array) : Audio embedding
		- global_score (bool) : Flag to calculate the global score
		- num_avg_frames (int) : Number of frames to average the scores
		- model (object) : Model object
	Returns:
		- pred_speaker (list) : List of active speakers in each frame
		- num_avg_frames (int) : Number of frames to average the scores
	'''

	cos = nn.CosineSimilarity(dim=1)

	audio_embedding = torch.tensor(audio_embedding).squeeze(2)

	scores = []
	for i in range(len(all_video_embeddings)):
		video_embedding = torch.tensor(all_video_embeddings[i])

		# Compute the similarity of each speaker's video embeddings with the audio embedding
		sim = cos(video_embedding, audio_embedding)

		# Apply the logits scale to the similarity scores (scaling the scores)
		output = model.logits_scale(sim.unsqueeze(-1)).squeeze(-1)

		if global_score=="True":
			score = output.mean(0)
		else:
			if output.shape[0]<num_avg_frames:
				num_avg_frames = output.shape[0]
			output_batch = output.unfold(0, num_avg_frames, 1)
			score = torch.mean(output_batch, axis=-1)

		scores.append(score.detach().cpu().numpy())

	if global_score=="True":
		print("Using global predictions")
		pred_speaker = np.argmax(scores)
	else:
		print("Using per-frame predictions")
		pred_speaker = []
		num_negs = list(range(0, len(all_video_embeddings)))
		for frame_idx in range(len(scores[0])):
			score = [scores[i][frame_idx] for i in num_negs] 
			pred_idx = np.argmax(score)
			pred_speaker.append(pred_idx)

	return pred_speaker, num_avg_frames
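
# Note: each speaker's score is the (logit-scaled) cosine similarity between their video embedding
# and the shared audio embedding. With per-frame predictions, `unfold(0, num_avg_frames, 1)` averages
# the similarity over a sliding window of num_avg_frames frames before taking the argmax speaker,
# which smooths out single-frame fluctuations.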


def save_video(output_tracks, input_frames, wav_file, result_folder):

	'''
	This function saves the output video with the active speaker detections

	Args:
		- output_tracks (list) : List of active speakers in each frame
		- input_frames (array) : Frames to be used to generate the video
		- wav_file (string) : Path of the audio file
		- result_folder (string) : Path of the result folder to save the output video
	Returns:
		- video_output (string) : Path of the output video
		- msg (string) : Message to be returned
	'''

	try:
		output_frames = []
		for i in range(len(input_frames)):

			# If the active speaker is found, draw a bounding box around the active speaker
			if i in output_tracks:
				bbox = output_tracks[i]
				x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
				out = cv2.rectangle(input_frames[i].copy(), (x1, y1), (x2, y2), color=[0, 255, 0], thickness=3)
			else:
				out = input_frames[i]		

			output_frames.append(out)

		# Generate the output video
		output_video_fname = os.path.join(result_folder, "result_active_speaker_det")
		video_output, status = generate_video(output_frames, wav_file, output_video_fname)     
		if status != "success":
			return None, status
	except Exception as e:
		return None, f"Error: {str(e)}"

	return video_output, "success"

def preprocess_asd(video_path, result_folder_input):

	'''
	This function preprocesses the video for the active speaker detection task

	Args:
		- video_path (string) : Path of the video file
		- result_folder_input (string) : Path of the folder to save the input video
	Returns:
		- msg (string) : Message to be returned
	'''

	file = video_path

	data_dir = os.path.join(result_folder_input, 'temp')
	sd_root = os.path.join(result_folder_input, 'crops')
	work_root = os.path.join(result_folder_input, 'metadata')
	data_root = result_folder_input

	os.makedirs(sd_root, exist_ok=True)
	os.makedirs(work_root, exist_ok=True)

	avi_dir = os.path.join(data_dir, 'pyavi')
	tmp_dir = os.path.join(data_dir, 'pytmp')
	work_dir = os.path.join(data_dir, 'pywork')
	crop_dir = os.path.join(data_dir, 'pycrop')
	frames_dir = os.path.join(data_dir, 'pyframes')

	status = process_video_asd(file, sd_root, work_root, data_root, avi_dir, tmp_dir, work_dir, crop_dir, frames_dir)
	if status != "success":
		return status
	print("Successfully pre-processed the video")
		
	return "success"

def process_video_syncoffset(video_path, num_avg_frames, apply_preprocess):
	
	'''
	This function processes the video for the sync offset prediction task

	Args:
		- video_path (string) : Path of the video file
		- num_avg_frames (int) : Number of frames to average the scores
		- apply_preprocess (bool) : Flag to apply the pre-processing steps or not
	Returns:
		- video_output (string) : Path of the output video
		- msg (string) : Message to be returned
	'''

	try:
		# Extract the video filename
		video_fname = os.path.basename(video_path.split(".")[0])
		
		# Create folders to save the inputs and results
		result_folder = os.path.join("results", video_fname)
		result_folder_input = os.path.join(result_folder, "input")
		result_folder_output = os.path.join(result_folder, "output")

		if os.path.exists(result_folder):
			rmtree(result_folder)

		os.makedirs(result_folder)
		os.makedirs(result_folder_input)
		os.makedirs(result_folder_output)

		
		# Preprocess the video
		print("Applying preprocessing: ", apply_preprocess)
		wav_file, fps, vid_path_processed, status = preprocess_video(video_path, result_folder_input, apply_preprocess)
		if status != "success":
			return None, status
		print("Successfully preprocessed the video")

		# Resample the video to 25 fps if it is not already 25 fps
		print("FPS of video: ", fps)
		if fps!=25:
			vid_path, status = resample_video(vid_path_processed, "preprocessed_video_25fps", result_folder_input)
			if status != "success":
				return None, status
			orig_vid_path_25fps, status = resample_video(video_path, "input_video_25fps", result_folder_input)
			if status != "success":
				return None, status
		else:
			vid_path = vid_path_processed
			orig_vid_path_25fps = video_path

		# Load the original video frames (before pre-processing) - Needed for the final sync-correction 
		orig_frames, status = load_video_frames(orig_vid_path_25fps)
		if status != "success":
			return None, status
			
		# Load the pre-processed video frames
		frames, status = load_video_frames(vid_path)
		if status != "success":
			return None, status
		print("Successfully extracted the video frames")

		if len(frames) < num_avg_frames:
			msg = "Error: The input video is too short. Please use a longer input video."
			return None, msg

		# Load keypoints and check if gestures are visible
		kp_dict, status = get_keypoints(frames)
		if status != "success":
			return None, status
		print("Successfully extracted the keypoints: ", len(kp_dict), len(kp_dict["kps"]))

		status = check_visible_gestures(kp_dict)
		if status != "success":
			return None, status

		# Load RGB frames
		rgb_frames, num_frames, orig_masked_frames, status = load_rgb_masked_frames(frames, kp_dict, asd=False, window_frames=25, width=480, height=270)
		if status != "success":
			return None, status
		print("Successfully loaded the RGB frames")

		# Convert frames to tensor
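		# Move the trailing axis of the frame stack to the front and add a batch dimension.
		# Assumption: load_rgb_masked_frames returns a (num_windows, window_frames, H, W, 3) array,
		# so rgb_frames becomes (1, 3, num_windows, window_frames, H, W); B is the batch size (1 here).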
		rgb_frames = np.transpose(rgb_frames, (4, 0, 1, 2, 3))
		rgb_frames = torch.FloatTensor(rgb_frames).unsqueeze(0)
		B = rgb_frames.size(0)
		print("Successfully converted the frames to tensor")

		# Load spectrograms
		spec, orig_spec, status = load_spectrograms(wav_file, asd=False, num_frames=num_frames)
		if status != "success":
			return None, status
		spec = torch.FloatTensor(spec).unsqueeze(0).unsqueeze(0).permute(0, 1, 2, 4, 3)
		print("Successfully loaded the spectrograms")

		# Create input windows
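		# Each index along dim 2 is one temporal window; concatenating the windows along dim 0
		# lets them be fed to the model as a single batch.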
		video_sequences = torch.cat([rgb_frames[:, :, i] for i in range(rgb_frames.size(2))], dim=0)
		audio_sequences = torch.cat([spec[:, :, i] for i in range(spec.size(2))], dim=0)

		# Load the trained model
		model = Transformer_RGB()
		model = load_checkpoint(CHECKPOINT_PATH, model)
		print("Successfully loaded the model")

		# Extract embeddings
		print("Obtaining audio and video embeddings...")
		video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, asd=False, calc_aud_emb=True)

		# L2 normalize embeddings
		print("Normalizing embeddings")
		video_emb = torch.tensor(video_emb)
		video_emb = torch.nn.functional.normalize(video_emb, p=2, dim=1)
		audio_emb = torch.tensor(audio_emb)
		audio_emb = torch.nn.functional.normalize(audio_emb, p=2, dim=1)

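		# Regroup the flattened per-window embeddings back into batch-major form: split the
		# (num_windows*B, ...) tensors into chunks of size B and stack them along a new window
		# axis, so the sync-offset computation below can compare audio and video windows over time.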
		audio_emb = torch.split(audio_emb, B, dim=0)
		audio_emb = torch.stack(audio_emb, dim=2)
		audio_emb = audio_emb.squeeze(3)
		audio_emb = audio_emb[:, None]

		video_emb = torch.split(video_emb, B, dim=0)
		video_emb = torch.stack(video_emb, dim=2)
		video_emb = video_emb.squeeze(3)
		print("Successfully extracted GestSync embeddings")
		

		# Calculate sync offset
		print("Calculating sync offset...")
		pred_offset, status = calc_optimal_av_offset(video_emb, audio_emb, num_avg_frames, model)
		if status != "success":
			return None, status
		print("Predicted offset: ", pred_offset)

		# Generate sync-corrected video
		video_output, status = sync_correct_video(video_path, orig_frames, wav_file, pred_offset, result_folder_output, sample_rate=16000, fps=fps)
		if status != "success":
			return None, status
		print("Successfully generated the video:", video_output)

		return video_output, f"Predicted offset: {pred_offset}"

	except Exception as e:
		return None, f"Error: {str(e)}"


def process_video_activespeaker(video_path, global_speaker, num_avg_frames):

	'''
	This function processes the video for the active speaker detection task

	Args:
		- video_path (string) : Path of the video file
		- global_speaker (string) : Flag to use global or per-frame predictions
		- num_avg_frames (int) : Number of frames to average the scores
	Returns:
		- video_output (string) : Path of the output video
		- msg (string) : Message to be returned
	'''

	try:
		# Extract the video filename
		video_fname = os.path.splitext(os.path.basename(video_path))[0]
		
		# Create folders to save the inputs and results
		result_folder = os.path.join("results", video_fname)
		result_folder_input = os.path.join(result_folder, "input")
		result_folder_output = os.path.join(result_folder, "output")

		if os.path.exists(result_folder):
			rmtree(result_folder)

		os.makedirs(result_folder)
		os.makedirs(result_folder_input)
		os.makedirs(result_folder_output)

		if global_speaker=="per-frame-prediction" and num_avg_frames<25:
			msg = "Number of frames to average need to be set to a minimum of 25 frames. Atleast 1-second context is needed for the model. Please change the num_avg_frames and try again..."
			return None, msg

		# Read the video
		try:
			vr = VideoReader(video_path, ctx=cpu(0))
		except Exception:
			msg = "Oops! Could not load the input video file"
			return None, msg

		# Get the FPS of the video
		fps = vr.get_avg_fps()
		print("FPS of video: ", fps)

		# Resample the video to 25 FPS if the original video is of a different frame-rate
		if fps!=25:
			test_video_25fps, status = resample_video(video_path, video_fname, result_folder_input)
			if status != "success":
				return None, status
		else:
			test_video_25fps = video_path

		# Load the video frames
		orig_frames, status = load_video_frames(test_video_25fps)
		if status != "success":
			return None, status

		# Extract and save the audio file
		orig_wav_file, status = extract_audio(video_path, result_folder)
		if status != "success":
			return None, status

		# Pre-process and extract per-speaker tracks in each scene 
		status = preprocess_asd(video_path, result_folder_input)
		if status != "success":
			return None, status

		# Load the tracks file saved during pre-processing
		with open('{}/metadata/tracks.pckl'.format(result_folder_input), 'rb') as file:
			tracks = pickle.load(file)


		# Create a dictionary of all tracks found along with the bounding-boxes
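		# Resulting structure: track_dict[scene_num][track_id][frame_num] -> bounding box of that
		# speaker track in that frame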
		track_dict = {}
		for scene_num in range(len(tracks)):
			track_dict[scene_num] = {}
			for i in range(len(tracks[scene_num])):
				track_dict[scene_num][i] = {}
				for frame_num, bbox in zip(tracks[scene_num][i]['track']['frame'], tracks[scene_num][i]['track']['bbox']):
					track_dict[scene_num][i][frame_num] = bbox

		# Get the total number of scenes 
		test_scenes = os.listdir("{}/crops".format(result_folder_input))
		print("Total scenes found in the input video = ", len(test_scenes))

		# Load the trained model
		model = Transformer_RGB()
		model = load_checkpoint(CHECKPOINT_PATH, model) 

		# Compute the active speaker in each scene 
		output_tracks = {}
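		# output_tracks maps each frame number to the bounding box of the speaker predicted to be
		# active in that frame (filled in at the end of each scene below)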
		for scene_num in tqdm(range(len(test_scenes))):
			test_videos = glob(os.path.join("{}/crops".format(result_folder_input), "scene_{}".format(str(scene_num)), "*.avi"))
			test_videos.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
			print("Scene {} -> Total video files found (speaker-specific tracks) = {}".format(scene_num, len(test_videos)))

			if len(test_videos)<=1:
				msg = "To detect the active speaker, at least 2 visible speakers are required for each scene! Please check the input video and try again..."
				return None, msg

			# Load the audio file
			audio_file = glob(os.path.join("{}/crops".format(result_folder_input), "scene_{}".format(str(scene_num)), "*.wav"))[0]
			spec, _, status = load_spectrograms(audio_file, asd=True)
			if status != "success":
				return None, status
			spec = torch.FloatTensor(spec).unsqueeze(0).unsqueeze(0).permute(0,1,2,4,3)
			print("Successfully loaded the spectrograms")
			
			# Load the masked input frames
			all_masked_frames, all_orig_masked_frames, status = load_masked_input_frames(test_videos, spec, audio_file, scene_num, result_folder_input)
			if status != "success":
				return None, status
			print("Successfully loaded the masked input frames")

			# Prepare the audio and video sequences for the model
			audio_sequences = torch.cat([spec[:, :, i] for i in range(spec.size(2))], dim=0)

			print("Obtaining audio and video embeddings...")
			all_video_embs = []
			for idx in tqdm(range(len(all_masked_frames))):
				with torch.no_grad():
					video_sequences = torch.cat([all_masked_frames[idx][:, :, i] for i in range(all_masked_frames[idx].size(2))], dim=0)

					if idx==0:
						video_emb, audio_emb = get_embeddings(video_sequences, audio_sequences, model, asd=True, calc_aud_emb=True)
					else:
						video_emb = get_embeddings(video_sequences, audio_sequences, model, asd=True, calc_aud_emb=False)
					all_video_embs.append(video_emb)
			print("Successfully extracted GestSync embeddings")

			# Predict the active speaker in each scene
			if global_speaker=="per-frame-prediction":
				predictions, num_avg_frames = predict_active_speaker(all_video_embs, audio_emb, "False", num_avg_frames, model)
			else:
				predictions, _ = predict_active_speaker(all_video_embs, audio_emb, "True", num_avg_frames, model)

			# Get the frames present in the scene
			frames_scene = tracks[scene_num][0]['track']['frame']

			# Prepare the active speakers list to draw the bounding boxes
			if global_speaker=="global-prediction":
				print("Aggregating scores using global predictions")
				active_speakers = [predictions]*len(frames_scene)
				start, end = 0, len(frames_scene)
			else:
				print("Aggregating scores using per-frame predictions")
				active_speakers = [0]*len(frames_scene)
				mid = num_avg_frames//2
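				# Each per-frame score is averaged over a sliding window of num_avg_frames frames, so only
				# the central len(frames_scene) - num_avg_frames + 1 frames get a direct prediction;
				# the boundary frames are filled in afterwards with the majority label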

				if num_avg_frames%2==0:	
					frame_pred = len(frames_scene)-(mid*2)+1
					start, end = mid, len(frames_scene)-mid+1
				else:
					frame_pred = len(frames_scene)-(mid*2)
					start, end = mid, len(frames_scene)-mid

				print("Frame scene: {} | Avg frames: {} | Frame predictions: {}".format(len(frames_scene), num_avg_frames, frame_pred))
				if len(predictions) != frame_pred:
					msg = "Predicted frames {} and input video frames {} do not match!!".format(len(predictions), frame_pred)
					return None, msg

				active_speakers[start:end] = predictions[0:]

				# The boundary frames have no direct prediction, so fill the initial and final segments
				# with the majority label of the nearest num_avg_frames predictions to cover the full video
				initial_preds = max(set(predictions[:num_avg_frames]), key=predictions[:num_avg_frames].count)
				active_speakers[0:start] = [initial_preds] * start
				
				final_preds = max(set(predictions[-num_avg_frames:]), key=predictions[-num_avg_frames:].count)
				active_speakers[end:] = [final_preds] * (len(frames_scene) - end)
				start, end = 0, len(active_speakers)
		
			# Get the output tracks for each frame
			pred_idx = 0
			for frame in frames_scene[start:end]:
				label = active_speakers[pred_idx]
				pred_idx += 1
				output_tracks[frame] = track_dict[scene_num][label][frame]

		# Save the output video
		video_output, status = save_video(output_tracks, orig_frames.copy(), orig_wav_file, result_folder_output)
		if status != "success":
			return None, status
		print("Successfully saved the output video: ", video_output)

		return video_output, "success"

	except Exception as e:
		return None, f"Error: {str(e)}"

if __name__ == "__main__":


	# Custom CSS and HTML
	custom_css = """
	<style>
		body {
			background-color: #ffffff;
			color: #333333;  /* Default text color */
		}
		.container {
			max-width: 100% !important;
			padding-left: 0 !important;
			padding-right: 0 !important;
		}
		.header {
			background-color: #f0f0f0;
			color: #333333;
			padding: 30px;
			margin-bottom: 30px;
			text-align: center;
			font-family: 'Helvetica Neue', Arial, sans-serif;
			box-shadow: 0 2px 4px rgba(0,0,0,0.1);
		}
		.header h1 {
			font-size: 36px;
			margin-bottom: 15px;
			font-weight: bold;
			color: #333333;  /* Explicitly set heading color */
		}
		.header h2 {
			font-size: 24px;
			margin-bottom: 10px;
			color: #333333;  /* Explicitly set subheading color */
		}
		.header p {
			font-size: 18px;
			margin: 5px 0;
			color: #666666;
		}
		.blue-text {
			color: #4a90e2;
		}
		/* Custom styles for slider container */
		.slider-container {
			background-color: white !important;
			padding-top: 0.9em;
			padding-bottom: 0.9em;
		}
		/* Add gap before examples */
		.examples-holder {
			margin-top: 2em;
		}

		/* Set fixed size for example videos */
		.gradio-container .gradio-examples .gr-sample {
			width: 240px !important;
			height: 135px !important;
			object-fit: cover;
			display: inline-block;
			margin-right: 10px;
		}

	.gradio-container .gradio-examples {
		display: flex;
		flex-wrap: wrap;
		gap: 10px;
		/* Ensure the parent container does not stretch */
		max-width: 100%;
		overflow: hidden;
	}

		/* Additional styles to ensure proper sizing in Safari */
		.gradio-container .gradio-examples .gr-sample img {
			width: 240px !important;
			height: 135px !important;
			object-fit: cover;
		}
	</style>
	"""

	custom_html = custom_css + """
	<div class="header">
		<h1><span class="blue-text">GestSync:</span> Determining who is speaking without a talking head</h1>
		<h2>Synchronization and Active Speaker Detection Demo</h2>
		<p><a href='https://www.robots.ox.ac.uk/~vgg/research/gestsync/'>Project Page</a> | <a href='https://github.com/Sindhu-Hegde/gestsync'>Github</a> | <a href='https://arxiv.org/abs/2310.05304'>Paper</a></p>
	</div>
	"""


	tips = """
	<div>
	<br><br>
	Please give us a 🌟 on <a href='https://github.com/Sindhu-Hegde/gestsync'>Github</a> if you like our work!

	Tips to get better results:
	<ul>
		<li>Number of Average Frames: the higher the number, the better the results.</li>
		<li>Enabling "Apply Preprocessing" gives better results for synchronization, but it is an expensive operation and might take a while.</li>
		<li>Input videos with clearly visible gestures work better.</li>
	</ul>

	Inference time:
	<ul>
		<li>Synchronization-correction: ~1 minute for a 10-second video</li>
		<li>Active-speaker-detection: ~2 minutes for a 10-second video</li>
	</ul>
	Note: Occasionally, there may be a delay in acquiring a GPU, as the model runs on a free community GPU from ZeroGPU.

	</div>
	"""

	# Define functions
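	# Show the num_avg_frames slider only when per-frame prediction is selected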
	def toggle_slider(global_speaker):
		if global_speaker == "per-frame-prediction":
			return gr.update(visible=True)
		else:
			return gr.update(visible=False)

	def toggle_demo(demo_choice):
		if demo_choice == "Synchronization-correction":
			return (
				gr.update(value=None, visible=True),  # video_input
				gr.update(value=75, visible=True),  # num_avg_frames
				gr.update(value=None, visible=True),  # apply_preprocess
				gr.update(value="global-prediction", visible=False), # global_speaker
				gr.update(value=None, visible=True),  # output_video
				gr.update(value="", visible=True),  # result_text
				gr.update(visible=True),  # submit_button
				gr.update(visible=True),  # clear_button
				gr.update(visible=True),  # sync_examples
				gr.update(visible=False),  # asd_examples
				gr.update(visible=True)  # tips
			)
		else:
			return (
				gr.update(value=None, visible=True),  # video_input
				gr.update(value=75, visible=True), # num_avg_frames
				gr.update(value=None, visible=False), # apply_preprocess
				gr.update(value="global-prediction", visible=True),  # global_speaker
				gr.update(value=None, visible=True),  # output_video
				gr.update(value="", visible=True),  # result_text
				gr.update(visible=True),  # submit_button
				gr.update(visible=True),  # clear_button
				gr.update(visible=False), # sync_examples
				gr.update(visible=True),   # asd_examples
				gr.update(visible=True)  # tips
			)

	def clear_inputs():
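		# Return values map to: demo_choice, video_input, global_speaker, num_avg_frames,
		# apply_preprocess, result_text, output_video (see the clear_button.click wiring below)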
		return None, None, "global-prediction", 75, None, "", None

	def process_video(video_input, demo_choice, global_speaker, num_avg_frames, apply_preprocess):
		if demo_choice == "Synchronization-correction":
			return process_video_syncoffset(video_input, num_avg_frames, apply_preprocess)
		else:
			return process_video_activespeaker(video_input, global_speaker, num_avg_frames)

	# Define paths to sample videos
	sync_sample_videos = [
		["samples/sync_sample_1.mp4"],
		["samples/sync_sample_2.mp4"]
	]

	asd_sample_videos = [
		["samples/asd_sample_1.mp4"],
		["samples/asd_sample_2.mp4"]
	]

	# Define Gradio interface
	with gr.Blocks(css=custom_css, theme=gr.themes.Default(primary_hue=gr.themes.colors.red, secondary_hue=gr.themes.colors.pink)) as demo:
		gr.HTML(custom_html)
		demo_choice = gr.Radio(
			choices=["Synchronization-correction", "Active-speaker-detection"],
			label="Please select the task you want to perform"
		)
		with gr.Row():
			with gr.Column():
				video_input = gr.Video(label="Upload Video", height=400, visible=False)
				num_avg_frames = gr.Slider(
					minimum=50,
					maximum=150,
					step=5,
					value=75,
					label="Number of Average Frames",
					visible=False
				)
				apply_preprocess = gr.Checkbox(label="Apply Preprocessing", value=False, visible=False)
				global_speaker = gr.Radio(
					choices=["global-prediction", "per-frame-prediction"],
					value="global-prediction",
					label="Global Speaker Prediction",
					visible=False
				)
				global_speaker.change(
					fn=toggle_slider,
					inputs=global_speaker,
					outputs=num_avg_frames
				)
			with gr.Column():
				output_video = gr.Video(label="Output Video", height=400, visible=False)
				result_text = gr.Textbox(label="Result", visible=False)
		
		with gr.Row():
			submit_button = gr.Button("Submit", variant="primary", visible=False)
			clear_button = gr.Button("Clear", visible=False)

		# Add a gap before examples
		gr.HTML('<div class="examples-holder"></div>')


		# Add examples that only populate the video input
		sync_examples = gr.Dataset(
			samples=sync_sample_videos,
			components=[video_input],
			type="values",
			visible=False
		)

		asd_examples = gr.Dataset(
			samples=asd_sample_videos,
			components=[video_input],
			type="values",
			visible=False
		)

		tips = gr.Markdown(tips, visible=False)


		demo_choice.change(
			fn=toggle_demo,
			inputs=demo_choice,
			outputs=[video_input, num_avg_frames, apply_preprocess, global_speaker, output_video, result_text, submit_button, clear_button, sync_examples, asd_examples, tips]
		)

		sync_examples.select(
			fn=lambda x: gr.update(value=x[0], visible=True),
			inputs=sync_examples,
			outputs=video_input
		)

		asd_examples.select(
			fn=lambda x: gr.update(value=x[0], visible=True),
			inputs=asd_examples,
			outputs=video_input
		)
		

		submit_button.click(
			fn=process_video,
			inputs=[video_input, demo_choice, global_speaker, num_avg_frames, apply_preprocess],
			outputs=[output_video, result_text]
		)
		
		clear_button.click(
			fn=clear_inputs,
			inputs=[],
			outputs=[demo_choice, video_input, global_speaker, num_avg_frames, apply_preprocess, result_text, output_video]
		)


	# Launch the interface
	demo.launch(allowed_paths=["."], share=True)