r3gm committed
Commit a987a8b
1 Parent(s): 4efb602

change default tab
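
This commit moves the "Audio Translation for a Video" tab from before the link section to after it. In a gr.Blocks layout, tabs render in definition order and the first gr.Tab defined is the one selected when the page loads, so reordering the blocks is all it takes to change the landing tab. A minimal sketch of the mechanism, with illustrative tab labels rather than the app's exact ones:

import gradio as gr

with gr.Blocks() as demo:
    # Defined first, so this tab is selected when the page loads.
    with gr.Tab("Translate from a link"):
        gr.Markdown("URL-based workflow goes here.")
    # Defined second: still shown, but no longer the default.
    with gr.Tab("Audio Translation for a Video"):
        gr.Markdown("File-upload workflow goes here.")

demo.launch()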

Files changed (1): app.py (+108 -104)
app.py CHANGED
@@ -668,110 +668,6 @@ with gr.Blocks(theme=theme) as demo:
     gr.Markdown(title)
     gr.Markdown(description)
 
-    #### video
-    with gr.Tab("Audio Translation for a Video"):
-        with gr.Row():
-            with gr.Column():
-                #video_input = gr.UploadButton("Click to Upload a video", file_types=["video"], file_count="single") #gr.Video() # height=300,width=300
-                video_input = gr.File(label="Submit a short Video")
-                #link = gr.HTML()
-                #video_input.change(submit_file_func, video_input, [video_input, link], show_progress='full')
-
-                SOURCE_LANGUAGE = gr.Dropdown(['Automatic detection', 'Arabic (ar)', 'Chinese (zh)', 'Czech (cs)', 'Danish (da)', 'Dutch (nl)', 'English (en)', 'Finnish (fi)', 'French (fr)', 'German (de)', 'Greek (el)', 'Hebrew (he)', 'Hindi (hi)', 'Hungarian (hu)', 'Italian (it)', 'Japanese (ja)', 'Korean (ko)', 'Persian (fa)', 'Polish (pl)', 'Portuguese (pt)', 'Russian (ru)', 'Spanish (es)', 'Turkish (tr)', 'Ukrainian (uk)', 'Urdu (ur)', 'Vietnamese (vi)'], value='Automatic detection',label = 'Source language', info="This is the original language of the video")
-                TRANSLATE_AUDIO_TO = gr.Dropdown(['Arabic (ar)', 'Chinese (zh)', 'Czech (cs)', 'Danish (da)', 'Dutch (nl)', 'English (en)', 'Finnish (fi)', 'French (fr)', 'German (de)', 'Greek (el)', 'Hebrew (he)', 'Hindi (hi)', 'Hungarian (hu)', 'Italian (it)', 'Japanese (ja)', 'Korean (ko)', 'Persian (fa)', 'Polish (pl)', 'Portuguese (pt)', 'Russian (ru)', 'Spanish (es)', 'Turkish (tr)', 'Ukrainian (uk)', 'Urdu (ur)', 'Vietnamese (vi)'], value='English (en)',label = 'Translate audio to', info="Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
-
-                line_ = gr.HTML("<hr></h2>")
-                gr.Markdown("Select how many people are speaking in the video.")
-                min_speakers = gr.Slider(1, MAX_TTS, default=1, label="min_speakers", step=1, visible=False)
-                max_speakers = gr.Slider(1, MAX_TTS, value=2, step=1, label="Max speakers", interative=True)
-                gr.Markdown("Select the voice you want for each speaker.")
-                def submit(value):
-                    visibility_dict = {
-                        f'tts_voice{i:02d}': gr.update(visible=i < value) for i in range(6)
-                    }
-                    return [value for value in visibility_dict.values()]
-                tts_voice00 = gr.Dropdown(list_tts, value='en-AU-WilliamNeural-Male', label = 'TTS Speaker 1', visible=True, interactive= True)
-                tts_voice01 = gr.Dropdown(list_tts, value='en-CA-ClaraNeural-Female', label = 'TTS Speaker 2', visible=True, interactive= True)
-                tts_voice02 = gr.Dropdown(list_tts, value='en-GB-ThomasNeural-Male', label = 'TTS Speaker 3', visible=False, interactive= True)
-                tts_voice03 = gr.Dropdown(list_tts, value='en-GB-SoniaNeural-Female', label = 'TTS Speaker 4', visible=False, interactive= True)
-                tts_voice04 = gr.Dropdown(list_tts, value='en-NZ-MitchellNeural-Male', label = 'TTS Speaker 5', visible=False, interactive= True)
-                tts_voice05 = gr.Dropdown(list_tts, value='en-GB-MaisieNeural-Female', label = 'TTS Speaker 6', visible=False, interactive= True)
-                max_speakers.change(submit, max_speakers, [tts_voice00, tts_voice01, tts_voice02, tts_voice03, tts_voice04, tts_voice05])
-
-            with gr.Column():
-                with gr.Accordion("Advanced Settings", open=False):
-
-                    AUDIO_MIX = gr.Dropdown(['Mixing audio with sidechain compression', 'Adjusting volumes and mixing audio'], value='Adjusting volumes and mixing audio', label = 'Audio Mixing Method', info="Mix original and translated audio files to create a customized, balanced output with two available mixing modes.")
-
-                    gr.HTML("<hr></h2>")
-                    gr.Markdown("Default configuration of Whisper.")
-                    WHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
-                    batch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
-                    compute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
-
-                    gr.HTML("<hr></h2>")
-                    VIDEO_OUTPUT_NAME = gr.Textbox(label="Translated file name" ,value="video_output.mp4", info="The name of the output file")
-                    PREVIEW = gr.Checkbox(label="Preview", info="Preview cuts the video to only 10 seconds for testing purposes. Please deactivate it to retrieve the full video duration.")
-
-            with gr.Column(variant='compact'):
-                with gr.Row():
-                    video_button = gr.Button("TRANSLATE", )
-                with gr.Row():
-                    video_output = gr.Video() #gr.outputs.File(label="DOWNLOAD TRANSLATED VIDEO")
-
-        line_ = gr.HTML("<hr></h2>")
-        if os.getenv("YOUR_HF_TOKEN") == None or os.getenv("YOUR_HF_TOKEN") == "":
-            HFKEY = gr.Textbox(visible= True, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
-        else:
-            HFKEY = gr.Textbox(visible= False, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
-
-        gr.Examples(
-            examples=[
-                [
-                    "./assets/Video_main.mp4",
-                    "",
-                    False,
-                    "large-v2",
-                    16,
-                    "float16",
-                    "Spanish (es)",
-                    "English (en)",
-                    1,
-                    2,
-                    'en-AU-WilliamNeural-Male',
-                    'en-CA-ClaraNeural-Female',
-                    'en-GB-ThomasNeural-Male',
-                    'en-GB-SoniaNeural-Female',
-                    'en-NZ-MitchellNeural-Male',
-                    'en-GB-MaisieNeural-Female',
-                    "video_output.mp4",
-                    'Adjusting volumes and mixing audio',
-                ],
-            ],
-            fn=translate_from_video,
-            inputs=[
-                video_input,
-                HFKEY,
-                PREVIEW,
-                WHISPER_MODEL_SIZE,
-                batch_size,
-                compute_type,
-                SOURCE_LANGUAGE,
-                TRANSLATE_AUDIO_TO,
-                min_speakers,
-                max_speakers,
-                tts_voice00,
-                tts_voice01,
-                tts_voice02,
-                tts_voice03,
-                tts_voice04,
-                tts_voice05,
-                VIDEO_OUTPUT_NAME,
-                AUDIO_MIX,
-            ],
-            outputs=[video_output],
-            cache_examples=False,
-        )
 
     ### link
 
@@ -880,6 +776,114 @@ with gr.Blocks(theme=theme) as demo:
         )
 
 
+    #### video
+    with gr.Tab("Audio Translation for a Video"):
+        with gr.Row():
+            with gr.Column():
+                #video_input = gr.UploadButton("Click to Upload a video", file_types=["video"], file_count="single") #gr.Video() # height=300,width=300
+                video_input = gr.File(label="Submit a short Video")
+                #link = gr.HTML()
+                #video_input.change(submit_file_func, video_input, [video_input, link], show_progress='full')
+
+                SOURCE_LANGUAGE = gr.Dropdown(['Automatic detection', 'Arabic (ar)', 'Chinese (zh)', 'Czech (cs)', 'Danish (da)', 'Dutch (nl)', 'English (en)', 'Finnish (fi)', 'French (fr)', 'German (de)', 'Greek (el)', 'Hebrew (he)', 'Hindi (hi)', 'Hungarian (hu)', 'Italian (it)', 'Japanese (ja)', 'Korean (ko)', 'Persian (fa)', 'Polish (pl)', 'Portuguese (pt)', 'Russian (ru)', 'Spanish (es)', 'Turkish (tr)', 'Ukrainian (uk)', 'Urdu (ur)', 'Vietnamese (vi)'], value='Automatic detection',label = 'Source language', info="This is the original language of the video")
+                TRANSLATE_AUDIO_TO = gr.Dropdown(['Arabic (ar)', 'Chinese (zh)', 'Czech (cs)', 'Danish (da)', 'Dutch (nl)', 'English (en)', 'Finnish (fi)', 'French (fr)', 'German (de)', 'Greek (el)', 'Hebrew (he)', 'Hindi (hi)', 'Hungarian (hu)', 'Italian (it)', 'Japanese (ja)', 'Korean (ko)', 'Persian (fa)', 'Polish (pl)', 'Portuguese (pt)', 'Russian (ru)', 'Spanish (es)', 'Turkish (tr)', 'Ukrainian (uk)', 'Urdu (ur)', 'Vietnamese (vi)'], value='English (en)',label = 'Translate audio to', info="Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
+
+                line_ = gr.HTML("<hr></h2>")
+                gr.Markdown("Select how many people are speaking in the video.")
+                min_speakers = gr.Slider(1, MAX_TTS, default=1, label="min_speakers", step=1, visible=False)
+                max_speakers = gr.Slider(1, MAX_TTS, value=2, step=1, label="Max speakers", interative=True)
+                gr.Markdown("Select the voice you want for each speaker.")
+                def submit(value):
+                    visibility_dict = {
+                        f'tts_voice{i:02d}': gr.update(visible=i < value) for i in range(6)
+                    }
+                    return [value for value in visibility_dict.values()]
+                tts_voice00 = gr.Dropdown(list_tts, value='en-AU-WilliamNeural-Male', label = 'TTS Speaker 1', visible=True, interactive= True)
+                tts_voice01 = gr.Dropdown(list_tts, value='en-CA-ClaraNeural-Female', label = 'TTS Speaker 2', visible=True, interactive= True)
+                tts_voice02 = gr.Dropdown(list_tts, value='en-GB-ThomasNeural-Male', label = 'TTS Speaker 3', visible=False, interactive= True)
+                tts_voice03 = gr.Dropdown(list_tts, value='en-GB-SoniaNeural-Female', label = 'TTS Speaker 4', visible=False, interactive= True)
+                tts_voice04 = gr.Dropdown(list_tts, value='en-NZ-MitchellNeural-Male', label = 'TTS Speaker 5', visible=False, interactive= True)
+                tts_voice05 = gr.Dropdown(list_tts, value='en-GB-MaisieNeural-Female', label = 'TTS Speaker 6', visible=False, interactive= True)
+                max_speakers.change(submit, max_speakers, [tts_voice00, tts_voice01, tts_voice02, tts_voice03, tts_voice04, tts_voice05])
+
+            with gr.Column():
+                with gr.Accordion("Advanced Settings", open=False):
+
+                    AUDIO_MIX = gr.Dropdown(['Mixing audio with sidechain compression', 'Adjusting volumes and mixing audio'], value='Adjusting volumes and mixing audio', label = 'Audio Mixing Method', info="Mix original and translated audio files to create a customized, balanced output with two available mixing modes.")
+
+                    gr.HTML("<hr></h2>")
+                    gr.Markdown("Default configuration of Whisper.")
+                    WHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
+                    batch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
+                    compute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
+
+                    gr.HTML("<hr></h2>")
+                    VIDEO_OUTPUT_NAME = gr.Textbox(label="Translated file name" ,value="video_output.mp4", info="The name of the output file")
+                    PREVIEW = gr.Checkbox(label="Preview", info="Preview cuts the video to only 10 seconds for testing purposes. Please deactivate it to retrieve the full video duration.")
+
+            with gr.Column(variant='compact'):
+                with gr.Row():
+                    video_button = gr.Button("TRANSLATE", )
+                with gr.Row():
+                    video_output = gr.Video() #gr.outputs.File(label="DOWNLOAD TRANSLATED VIDEO")
+
+        line_ = gr.HTML("<hr></h2>")
+        if os.getenv("YOUR_HF_TOKEN") == None or os.getenv("YOUR_HF_TOKEN") == "":
+            HFKEY = gr.Textbox(visible= True, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
+        else:
+            HFKEY = gr.Textbox(visible= False, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
+
+        gr.Examples(
+            examples=[
+                [
+                    "./assets/Video_main.mp4",
+                    "",
+                    False,
+                    "large-v2",
+                    16,
+                    "float16",
+                    "Spanish (es)",
+                    "English (en)",
+                    1,
+                    2,
+                    'en-AU-WilliamNeural-Male',
+                    'en-CA-ClaraNeural-Female',
+                    'en-GB-ThomasNeural-Male',
+                    'en-GB-SoniaNeural-Female',
+                    'en-NZ-MitchellNeural-Male',
+                    'en-GB-MaisieNeural-Female',
+                    "video_output.mp4",
+                    'Adjusting volumes and mixing audio',
+                ],
+            ],
+            fn=translate_from_video,
+            inputs=[
+                video_input,
+                HFKEY,
+                PREVIEW,
+                WHISPER_MODEL_SIZE,
+                batch_size,
+                compute_type,
+                SOURCE_LANGUAGE,
+                TRANSLATE_AUDIO_TO,
+                min_speakers,
+                max_speakers,
+                tts_voice00,
+                tts_voice01,
+                tts_voice02,
+                tts_voice03,
+                tts_voice04,
+                tts_voice05,
+                VIDEO_OUTPUT_NAME,
+                AUDIO_MIX,
+            ],
+            outputs=[video_output],
+            cache_examples=False,
+        )
+
+
+
+
     with gr.Tab("Custom voice RVC"):
         with gr.Column():
             with gr.Accordion("Download RVC Models", open=True):
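
A note on the speaker-count wiring inside the moved block: max_speakers.change(submit, max_speakers, [...]) passes the slider value to submit(), which returns one gr.update(visible=...) per TTS dropdown, applied positionally to the six outputs. A standalone sketch of that pattern, assuming Gradio 3.x; the component names and voice choices are illustrative, and it writes interactive=True where the original has the apparent typo interative=True:

import gradio as gr

MAX_TTS = 6  # assumed to match the six tts_voice dropdowns in app.py

def set_voice_visibility(count):
    # One update per dropdown: show the first `count`, hide the rest.
    return [gr.update(visible=i < count) for i in range(MAX_TTS)]

with gr.Blocks() as demo:
    max_speakers = gr.Slider(1, MAX_TTS, value=2, step=1, label="Max speakers", interactive=True)
    voices = [
        gr.Dropdown(["voice-a", "voice-b"], label=f"TTS Speaker {i + 1}", visible=i < 2)
        for i in range(MAX_TTS)
    ]
    # The slider value is the single input; the returned list of updates
    # is applied to the output components in order.
    max_speakers.change(set_voice_visibility, max_speakers, voices)

demo.launch()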