Maximofn committed on
Commit
c7ef4ea
1 Parent(s): 34e01f2

Remove vocals from UI code

Browse files
Files changed (3) hide show
  1. app.py +31 -38
  2. concat_transcriptions.py +1 -10
  3. transcribe.py +9 -21
app.py CHANGED
@@ -358,6 +358,7 @@ def reset_frontend():
358
  gr.Dropdown(visible=visible),
359
  gr.Dropdown(visible=visible),
360
  gr.Dropdown(visible=visible),
 
361
  gr.Button(visible=visible),
362
  gr.Textbox(visible=visible),
363
  gr.Textbox(visible=visible),
@@ -401,6 +402,7 @@ def is_valid_url(url):
401
  num_speaker = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
402
  source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
403
  target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
 
404
  number_of_speakers = gr.Dropdown(visible=True, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True)
405
  subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True)
406
 
@@ -413,6 +415,7 @@ def is_valid_url(url):
413
  gr.Image(value=thumbnail, visible=True, show_download_button=False, container=False),
414
  source_languaje,
415
  target_languaje,
 
416
  number_of_speakers,
417
  subtify_button,
418
  )
@@ -421,6 +424,7 @@ def is_valid_url(url):
421
  gr.Image(value="assets/youtube-no-thumbnails.webp", visible=True, show_download_button=False, container=False),
422
  source_languaje,
423
  target_languaje,
 
424
  number_of_speakers,
425
  subtify_button,
426
  )
@@ -432,6 +436,7 @@ def is_valid_url(url):
432
  gr.Image(value="assets/twitch.webp", visible=True, show_download_button=False, container=False),
433
  source_languaje,
434
  target_languaje,
 
435
  number_of_speakers,
436
  subtify_button,
437
  )
@@ -441,12 +446,14 @@ def is_valid_url(url):
441
  image = gr.Image(value="assets/youtube_error.webp", visible=visible, show_download_button=False, container=False)
442
  source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
443
  target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
 
444
  number_of_speakers = gr.Dropdown(visible=visible, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True)
445
  subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=visible)
446
  return (
447
  image,
448
  source_languaje,
449
  target_languaje,
 
450
  number_of_speakers,
451
  subtify_button,
452
  )
@@ -491,44 +498,26 @@ def slice_audio(audio_path):
491
  command = f"python {python_file} {audio_path} {SECONDS}"
492
  os.system(command)
493
 
494
- with open(f"{folder_vocals}/speakers.txt", 'w') as f:
495
- f.write(str(0))
496
- command = f"mv {folder_chunck}/*.mp3 {folder_vocals}/"
497
- os.system(command)
498
-
499
  return (
500
  gr.Textbox(value="Ok")
501
  )
502
 
503
- def trascribe_audio(source_languaje):
504
- folder_vocals = "vocals"
505
  python_file = "transcribe.py"
506
- chunck_file = "chunks/output_files.txt"
507
- speakers_file = "vocals/speakers.txt"
508
- command = f"python {python_file} {chunck_file} {source_languaje} {speakers_file} {DEVICE} {not SEPARE_VOCALS}"
509
  os.system(command)
510
 
511
- with open(chunck_file, 'r') as f:
512
  files = f.read().splitlines()
513
- with open(speakers_file, 'r') as f:
514
- speakers = f.read().splitlines()
515
- speakers = int(speakers[0])
516
  for file in files:
517
- if speakers > 0:
518
- vocals_extension = "wav"
519
- for i in range(speakers):
520
- file_name, _ = file.split(".")
521
- _, file_name = file_name.split("/")
522
- vocal = f'{folder_vocals}/{file_name}_speaker{i:003d}.{vocals_extension}'
523
- command = f"rm {vocal}"
524
- os.system(command)
525
- else:
526
- vocals_extension = "mp3"
527
- file_name, _ = file.split(".")
528
- _, file_name = file_name.split("/")
529
- vocal = f'{folder_vocals}/{file_name}.{vocals_extension}'
530
- command = f"rm {vocal}"
531
- os.system(command)
532
 
533
  return (
534
  gr.Textbox(value="Ok")
@@ -540,9 +529,8 @@ def concatenate_transcriptions():
540
  os.makedirs(folder_concatenated)
541
 
542
  chunck_file = "chunks/output_files.txt"
543
- speakers_file = "vocals/speakers.txt"
544
  python_file = "concat_transcriptions.py"
545
- command = f"python {python_file} {chunck_file} {SECONDS} {speakers_file}"
546
  os.system(command)
547
 
548
  with open(chunck_file, 'r') as f:
@@ -595,14 +583,18 @@ def add_translated_subtitles_to_video(original_video_path, original_audio_path,
595
  os.system(command)
596
  command = f"rm chunks/output_files.txt"
597
  os.system(command)
598
- command = f"rm vocals/speakers.txt"
599
- os.system(command)
600
 
601
  subtitled_video = "videos/download_video_with_subtitles.mp4"
602
 
 
603
  return (
604
- gr.Textbox(value="Ok"),
605
  gr.Video(value=subtitled_video, visible=True),
 
 
 
 
 
 
606
  )
607
 
608
  def subtify():
@@ -626,7 +618,7 @@ def subtify():
626
  with gr.Row():
627
  source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True, info="Language of the video")
628
  target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True, info="Language to translate the subtitles")
629
- with gr.Accordion("Advanced settings", open=True):
630
  number_of_speakers = gr.Dropdown(visible=visible, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True, info="Number of speakers in the video, if you don't know, select 10")
631
  subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=visible)
632
 
@@ -654,6 +646,7 @@ def subtify():
654
  image,
655
  source_languaje,
656
  target_languaje,
 
657
  number_of_speakers,
658
  subtify_button,
659
  auxiliar_block2,
@@ -673,7 +666,7 @@ def subtify():
673
  auxiliar_block1.change(
674
  fn=is_valid_url,
675
  inputs=url_textbox,
676
- outputs=[image, source_languaje, target_languaje, number_of_speakers, subtify_button]
677
  )
678
  subtify_button.click(
679
  fn=change_visibility_texboxes,
@@ -691,7 +684,7 @@ def subtify():
691
  )
692
  video_sliced_progress_info.change(
693
  fn=trascribe_audio,
694
- inputs=[source_languaje],
695
  outputs=[video_transcribed_progress_info]
696
  )
697
  video_transcribed_progress_info.change(
@@ -706,7 +699,7 @@ def subtify():
706
  video_translated_progress_info.change(
707
  fn=add_translated_subtitles_to_video,
708
  inputs=[original_video_path, original_audio_path, original_audio_translated_path],
709
- outputs=[video_subtitled_progress_info, subtitled_video]
710
  )
711
 
712
  gr.Markdown(html_buy_me_a_coffe)
 
358
  gr.Dropdown(visible=visible),
359
  gr.Dropdown(visible=visible),
360
  gr.Dropdown(visible=visible),
361
+ gr.Accordion(visible=visible),
362
  gr.Button(visible=visible),
363
  gr.Textbox(visible=visible),
364
  gr.Textbox(visible=visible),
 
402
  num_speaker = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
403
  source_languaje = gr.Dropdown(visible=True, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
404
  target_languaje = gr.Dropdown(visible=True, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
405
+ advanced_setings = gr.Accordion(visible=True)
406
  number_of_speakers = gr.Dropdown(visible=True, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True)
407
  subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=True)
408
 
 
415
  gr.Image(value=thumbnail, visible=True, show_download_button=False, container=False),
416
  source_languaje,
417
  target_languaje,
418
+ advanced_setings,
419
  number_of_speakers,
420
  subtify_button,
421
  )
 
424
  gr.Image(value="assets/youtube-no-thumbnails.webp", visible=True, show_download_button=False, container=False),
425
  source_languaje,
426
  target_languaje,
427
+ advanced_setings,
428
  number_of_speakers,
429
  subtify_button,
430
  )
 
436
  gr.Image(value="assets/twitch.webp", visible=True, show_download_button=False, container=False),
437
  source_languaje,
438
  target_languaje,
439
+ advanced_setings,
440
  number_of_speakers,
441
  subtify_button,
442
  )
 
446
  image = gr.Image(value="assets/youtube_error.webp", visible=visible, show_download_button=False, container=False)
447
  source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True)
448
  target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True)
449
+ advanced_setings = gr.Accordion(visible=visible)
450
  number_of_speakers = gr.Dropdown(visible=visible, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True)
451
  subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=visible)
452
  return (
453
  image,
454
  source_languaje,
455
  target_languaje,
456
+ advanced_setings,
457
  number_of_speakers,
458
  subtify_button,
459
  )
 
498
  command = f"python {python_file} {audio_path} {SECONDS}"
499
  os.system(command)
500
 
 
 
 
 
 
501
  return (
502
  gr.Textbox(value="Ok")
503
  )
504
 
505
+ def trascribe_audio(source_languaje, number_of_speakers):
506
+ folder_chunks = "chunks"
507
  python_file = "transcribe.py"
508
+ chunks_file = "chunks/output_files.txt"
509
+ command = f"python {python_file} {chunks_file} {source_languaje} {number_of_speakers} {DEVICE}"
 
510
  os.system(command)
511
 
512
+ with open(chunks_file, 'r') as f:
513
  files = f.read().splitlines()
 
 
 
514
  for file in files:
515
+ audios_extension = "mp3"
516
+ file_name, _ = file.split(".")
517
+ _, file_name = file_name.split("/")
518
+ vocal = f'{folder_chunks}/{file_name}.{audios_extension}'
519
+ command = f"rm {vocal}"
520
+ os.system(command)
 
 
 
 
 
 
 
 
 
521
 
522
  return (
523
  gr.Textbox(value="Ok")
 
529
  os.makedirs(folder_concatenated)
530
 
531
  chunck_file = "chunks/output_files.txt"
 
532
  python_file = "concat_transcriptions.py"
533
+ command = f"python {python_file} {chunck_file} {SECONDS}"
534
  os.system(command)
535
 
536
  with open(chunck_file, 'r') as f:
 
583
  os.system(command)
584
  command = f"rm chunks/output_files.txt"
585
  os.system(command)
 
 
586
 
587
  subtitled_video = "videos/download_video_with_subtitles.mp4"
588
 
589
+ visible = False
590
  return (
 
591
  gr.Video(value=subtitled_video, visible=True),
592
+ gr.Textbox(visible=visible),
593
+ gr.Textbox(visible=visible),
594
+ gr.Textbox(visible=visible),
595
+ gr.Textbox(visible=visible),
596
+ gr.Textbox(visible=visible),
597
+ gr.Textbox(value="Ok", visible=visible),
598
  )
599
 
600
  def subtify():
 
618
  with gr.Row():
619
  source_languaje = gr.Dropdown(visible=visible, label="Source languaje", show_label=True, value="English", choices=language_dict, scale=1, interactive=True, info="Language of the video")
620
  target_languaje = gr.Dropdown(visible=visible, label="Target languaje", show_label=True, value="Español", choices=language_dict, scale=1, interactive=True, info="Language to translate the subtitles")
621
+ with gr.Accordion("Advanced settings", open=False, visible=visible) as Advanced_setings:
622
  number_of_speakers = gr.Dropdown(visible=visible, label="Number of speakers", show_label=True, value=10, choices=num_speaker, scale=1, interactive=True, info="Number of speakers in the video, if you don't know, select 10")
623
  subtify_button = gr.Button(size="lg", value="subtify", min_width="10px", scale=0, visible=visible)
624
 
 
646
  image,
647
  source_languaje,
648
  target_languaje,
649
+ Advanced_setings,
650
  number_of_speakers,
651
  subtify_button,
652
  auxiliar_block2,
 
666
  auxiliar_block1.change(
667
  fn=is_valid_url,
668
  inputs=url_textbox,
669
+ outputs=[image, source_languaje, target_languaje, Advanced_setings, number_of_speakers, subtify_button]
670
  )
671
  subtify_button.click(
672
  fn=change_visibility_texboxes,
 
684
  )
685
  video_sliced_progress_info.change(
686
  fn=trascribe_audio,
687
+ inputs=[source_languaje, number_of_speakers],
688
  outputs=[video_transcribed_progress_info]
689
  )
690
  video_transcribed_progress_info.change(
 
699
  video_translated_progress_info.change(
700
  fn=add_translated_subtitles_to_video,
701
  inputs=[original_video_path, original_audio_path, original_audio_translated_path],
702
+ outputs=[subtitled_video, video_donwloaded_progress_info, video_sliced_progress_info, video_transcribed_progress_info, transcriptions_concatenated_progress_info, video_translated_progress_info, video_subtitled_progress_info]
703
  )
704
 
705
  gr.Markdown(html_buy_me_a_coffe)
concat_transcriptions.py CHANGED
@@ -94,17 +94,8 @@ if __name__ == "__main__":
94
  parser = argparse.ArgumentParser()
95
  parser.add_argument("chunk_files", help="Path to the file containing the paths to the chunk files")
96
  parser.add_argument("seconds", help="Duration of each chunk in seconds")
97
- parser.add_argument('speakers_file', help='File with the number of speakers')
98
  args = parser.parse_args()
99
 
100
  chunk_files = args.chunk_files
101
  seconds = int(args.seconds)
102
- with open(args.speakers_file, 'r') as f:
103
- speakers = f.read().splitlines()
104
- speakers = int(speakers[0])
105
-
106
- if speakers > 0:
107
- for speaker in range(speakers):
108
- pass
109
- else:
110
- concatenate_transcriptions(chunk_files, seconds)
 
94
  parser = argparse.ArgumentParser()
95
  parser.add_argument("chunk_files", help="Path to the file containing the paths to the chunk files")
96
  parser.add_argument("seconds", help="Duration of each chunk in seconds")
 
97
  args = parser.parse_args()
98
 
99
  chunk_files = args.chunk_files
100
  seconds = int(args.seconds)
101
+ concatenate_transcriptions(chunk_files, seconds)
 
 
 
 
 
 
 
 
transcribe.py CHANGED
@@ -21,7 +21,7 @@ for language_name, language_code in LANGUAGE_NAME_TO_CODE.items():
21
  "translator": language_code
22
  }
23
 
24
- def transcribe(audio_file, language, device, vocals):
25
  output_folder = "transcriptions"
26
 
27
  # Transcribe audio file
@@ -37,31 +37,25 @@ def transcribe(audio_file, language, device, vocals):
37
  batch_size = 8
38
  verbose = False
39
  min_speakers = 1
40
- max_speakers = 10
41
  threads = 4
42
  output_format = "srt"
43
  hf_token = "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"
44
  command = f'whisperx {audio_file} --model {model} --batch_size {batch_size} --compute_type {compute_type} \
45
  --output_dir {output_folder} --output_format {output_format} --verbose {verbose} --language {language} \
46
- --fp16 {fp16} --threads {threads} --print_progress {print_progress} --device {device}'
47
- if vocals:
48
- command += f' --diarize --max_speakers {max_speakers} --min_speakers {min_speakers} --hf_token {hf_token}'
49
  os.system(command)
50
 
51
  if __name__ == "__main__":
52
  parser = argparse.ArgumentParser(description='Transcribe audio files')
53
  parser.add_argument('input_files', help='Input audio files')
54
  parser.add_argument('language', help='Language of the audio file')
55
- parser.add_argument('speakers_file', help='File with the number of speakers')
56
  parser.add_argument('device', help='Device to use for PyTorch inference')
57
- parser.add_argument('vocals', help='Vocals or not')
58
  args = parser.parse_args()
59
 
60
- vocals_folder = "vocals"
61
-
62
- with open(args.speakers_file, 'r') as f:
63
- speakers = f.read().splitlines()
64
- speakers = int(speakers[0])
65
 
66
  with open(args.input_files, 'r') as f:
67
  inputs = f.read().splitlines()
@@ -70,13 +64,7 @@ if __name__ == "__main__":
70
  for input in inputs:
71
  input_file, _ = input.split('.')
72
  _, input_name = input_file.split('/')
73
- if speakers > 0:
74
- extension = "wav"
75
- for i in range(speakers):
76
- file = f'{vocals_folder}/{input_name}_speaker{i:003d}.{extension}'
77
- transcribe(file, language_dict[args.language]["transcriber"], args.device, args.vocals)
78
- else:
79
- extension = "mp3"
80
- file = f'{vocals_folder}/{input_name}.{extension}'
81
- transcribe(file, language_dict[args.language]["transcriber"], args.device, args.vocals)
82
  progress_bar.update(1)
 
21
  "translator": language_code
22
  }
23
 
24
+ def transcribe(audio_file, language, num_speakers, device):
25
  output_folder = "transcriptions"
26
 
27
  # Transcribe audio file
 
37
  batch_size = 8
38
  verbose = False
39
  min_speakers = 1
40
+ max_speakers = num_speakers
41
  threads = 4
42
  output_format = "srt"
43
  hf_token = "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"
44
  command = f'whisperx {audio_file} --model {model} --batch_size {batch_size} --compute_type {compute_type} \
45
  --output_dir {output_folder} --output_format {output_format} --verbose {verbose} --language {language} \
46
+ --fp16 {fp16} --threads {threads} --print_progress {print_progress} --device {device} \
47
+ --diarize --max_speakers {max_speakers} --min_speakers {min_speakers} --hf_token {hf_token}'
 
48
  os.system(command)
49
 
50
  if __name__ == "__main__":
51
  parser = argparse.ArgumentParser(description='Transcribe audio files')
52
  parser.add_argument('input_files', help='Input audio files')
53
  parser.add_argument('language', help='Language of the audio file')
54
+ parser.add_argument('num_speakers', help='Number of speakers in the audio file')
55
  parser.add_argument('device', help='Device to use for PyTorch inference')
 
56
  args = parser.parse_args()
57
 
58
+ chunks_folder = "chunks"
 
 
 
 
59
 
60
  with open(args.input_files, 'r') as f:
61
  inputs = f.read().splitlines()
 
64
  for input in inputs:
65
  input_file, _ = input.split('.')
66
  _, input_name = input_file.split('/')
67
+ extension = "mp3"
68
+ file = f'{chunks_folder}/{input_name}.{extension}'
69
+ transcribe(file, language_dict[args.language]["transcriber"], args.num_speakers, args.device)
 
 
 
 
 
 
70
  progress_bar.update(1)