"""Gradio demo: clone a voice with Bark (Coqui TTS) and speak a themed script.

The user records a short voice sample, picks a theme (e.g. "Mayor of Toronto")
and a script type ("Random"/"Negative"), then the app synthesizes the chosen
script in the cloned voice and returns the audio, a waveform video, and the
generated speaker ``.npz`` file.
"""

from TTS.api import TTS
import gradio as gr
from gradio import Dropdown
from scipy.io.wavfile import write
import os
import shutil
import re

# NOTE(review): unused module-level state, kept for backward compatibility.
user_choice = ""

# Hard cap on sentences synthesized per request, to keep the demo responsive.
MAX_NUMBER_SENTENCES = 10

# Deployment feature flag; read here, consumed by the hosting environment.
file_upload_available = os.environ.get("ALLOW_FILE_UPLOAD")

# theme -> {script type -> script text} spoken by the cloned voice.
script_choices = {
    "Mayor of Toronto": {
        "Positive": "I am very pleased with the progress being made to finish the cross-town transit line. This has been an excellent use of taxpayer dollars.",
        "Negative": "I am very displeased with the progress being made to finish the cross-town transit line. This has been an embarrassing use of taxpayer dollars.",
        "Random": "I like being Mayor because I don’t have to pay my parking tickets."
    },
    "Witness": {
        "Positive": "Yes, John is my friend. He was at my house watching the baseball game all night.",
        "Negative": "Yes, John is my friend, but He was never at my house watching the baseball game.",
        "Random": "He is my friend, but I do not trust John."
    },
    "Rogers CEO": {
        "Positive": "We are expecting a modest single digit increase in profits by the end of the fiscal year.",
        "Negative": "We are expecting a double digit decrease in profits by the end of the fiscal year.",
        "Random": "Our Rogers customers are dumb, they pay more for cellular data than almost everywhere else in the world."
    },
    "Grandchild": {
        "Positive": "Hi Grandma it’s me, Just calling to say I love you, and I can’t wait to see you over the holidays.",
        "Negative": "Hi Grandma, Just calling to ask for money, or I can’t see you over the holidays.",
        "Random": "Grandma, I can’t find your email address. I need to send you something important."
    }
}

# Load the multilingual Bark voice-cloning model once at startup.
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)


def infer(prompt, input_wav_file, script_type, selected_theme):
    """Clone the recorded voice and synthesize the selected script with it.

    Args:
        prompt: Text the user read aloud; only validated/length-capped here —
            the synthesized text is the themed script, not the prompt.
        input_wav_file: Filesystem path of the recorded voice sample.
        script_type: "Positive" / "Negative" / "Random" variant to speak.
        selected_theme: Key into ``script_choices``.

    Returns:
        A 4-tuple matching the Gradio outputs
        (cloned audio update, waveform video, .npz file update, speaker dir).
    """
    print("Prompt:", prompt)
    print("Input WAV File:", input_wav_file)
    print("Script Type:", script_type)
    print(selected_theme)
    print("""
—————
NEW INFERENCE:
———————
""")

    if prompt == "":
        gr.Warning("Do not forget to provide a tts prompt !")
        # BUGFIX: the original fell through returning None for 4 outputs;
        # return an explicit 4-tuple so Gradio does not crash on arity.
        return None, None, gr.update(visible=False), None

    # Move the recorded sample into bark_voices/<name>/<name>.wav — the
    # directory layout the Bark speaker loader expects (voice_dir/speaker).
    source_path = input_wav_file
    destination_directory = "bark_voices"
    file_name = os.path.splitext(os.path.basename(source_path))[0]
    destination_path = os.path.join(destination_directory, file_name)
    os.makedirs(destination_path, exist_ok=True)
    shutil.move(source_path, os.path.join(destination_path, f"{file_name}.wav"))

    # Cap the prompt length so one request cannot monopolize the demo.
    sentences = re.split(r'(?<=[.!?])\s+', prompt)
    if len(sentences) > MAX_NUMBER_SENTENCES:
        gr.Info("Your text is too long. To keep this demo enjoyable for everyone, we only kept the first 10 sentences :) Duplicate this space and set MAX_NUMBER_SENTENCES for longer texts ;)")
        prompt = ' '.join(sentences[:MAX_NUMBER_SENTENCES])

    theme_dict = script_choices.get(selected_theme, {})
    chosen_script = theme_dict.get(script_type, "")
    gr.Info("Generating audio from prompt")
    print(theme_dict)
    print(chosen_script)

    tts.tts_to_file(
        text=chosen_script,
        file_path="output.wav",
        voice_dir="bark_voices/",
        speaker=f"{file_name}",
    )

    contents = os.listdir(destination_path)
    for item in contents:
        print(item)

    print("Preparing final waveform video ...")
    tts_video = gr.make_waveform(audio="output.wav")
    print(tts_video)
    print("FINISHED")

    # BUGFIX: pick the generated speaker embedding explicitly instead of the
    # order-dependent contents[1] (IndexError with fewer than two entries).
    npz_candidates = [f for f in contents if f.endswith(".npz")]
    npz_value = (
        os.path.join(destination_path, npz_candidates[0]) if npz_candidates else None
    )

    # BUGFIX: the original returned 5 values (including a stray
    # gr.Group.update) for 4 declared outputs; also un-hide cloned_out,
    # which was created with visible=False.
    return (
        gr.update(value="output.wav", visible=True),
        tts_video,
        gr.update(value=npz_value, visible=True),
        destination_path,
    )


# theme -> decorative emoji shown next to the theme name.
theme_emojis = {
    "Mayor of Toronto": "🏙️",
    "Witness": "👤",
    "Rogers CEO": "📱",
    "Grandchild": "👪"
}

css = """
#col-container {max-width: 780px; margin-left: auto; margin-right: auto; background-size: contain; background-repeat: no-repeat;}
#theme-emoji-bg {position: absolute; top: 0; left: 0; width: 100%; height: 100%; z-index: -1; opacity: 0.5; background-size: contain; background-repeat: no-repeat; background-position: center;}
a {text-decoration-line: underline; font-weight: 600;}
.mic-wrap > button {
  width: 100%;
  height: 60px;
  font-size: 1.4em!important;
}
.record-icon.svelte-1thnwz {
  display: flex;
  position: relative;
  margin-right: var(--size-2);
  width: unset;
  height: unset;
}
span.record-icon > span.dot.svelte-1thnwz {
  width: 20px!important;
  height: 20px!important;
}
.animate-spin {
  animation: spin 1s linear infinite;
}
@keyframes spin {
  from { transform: rotate(0deg); }
  to { transform: rotate(360deg); }
}
#theme-emoji { position: absolute; top: 10px; right: 10px; }
"""


def load_hidden_mic(audio_in):
    """Pass the freshly recorded sample through to the hidden audio widget."""
    print("USER RECORDED A NEW SAMPLE")
    return audio_in


def update_script_text(theme, script_type):
    """Refresh the read-aloud script, the bot script, and the theme emoji.

    BUGFIX: the original returned a 4th value (``theme``) for only three
    declared Gradio outputs, causing an arity mismatch.
    """
    positive_script = script_choices.get(theme, {}).get("Positive", "")
    output_script = script_choices.get(theme, {}).get(script_type, "")
    theme_emoji = theme_emojis.get(theme, "")
    return positive_script, output_script, theme_emoji


with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        with gr.Row():
            with gr.Column():
                theme_emoji_output = gr.Label(label="Theme Emoji")
                theme_dropdown = gr.Dropdown(
                    label="1. Select a Theme",
                    choices=list(script_choices.keys()),
                )
                script_text = gr.Textbox(
                    label="2 & 3. Read the script below aloud THREE times for the best output:",
                    lines=5,
                )
                script_type_dropdown = gr.Dropdown(
                    label="4. Select the Script Type for Bot Output",
                    choices=["Random", "Negative"],
                )
                output_script_text = gr.Textbox(
                    label="The bot will try to emulate the following script:",
                    lines=5,
                )

                # BUGFIX: register exactly one handler per control; the
                # original attached theme_dropdown.change twice.
                theme_dropdown.change(
                    fn=update_script_text,
                    inputs=[theme_dropdown, script_type_dropdown],
                    outputs=[script_text, output_script_text, theme_emoji_output],
                )
                script_type_dropdown.change(
                    fn=update_script_text,
                    inputs=[theme_dropdown, script_type_dropdown],
                    outputs=[script_text, output_script_text, theme_emoji_output],
                )

                # Microphone input replaces file upload: record the voice to clone.
                micro_in = gr.Audio(
                    label="Record voice to clone",
                    type="filepath",
                    source="microphone",
                    interactive=True,
                )
                hidden_audio_numpy = gr.Audio(type="numpy", visible=False)
                submit_btn = gr.Button("Submit")

            with gr.Column():
                cloned_out = gr.Audio(label="Text to speech output", visible=False)
                video_out = gr.Video(label="Waveform video", elem_id="voice-video-out")
                npz_file = gr.File(label=".npz file", visible=False)
                folder_path = gr.Textbox(visible=False)

    micro_in.stop_recording(
        fn=load_hidden_mic,
        inputs=[micro_in],
        outputs=[hidden_audio_numpy],
        queue=False,
    )

    submit_btn.click(
        fn=infer,
        inputs=[script_text, micro_in, script_type_dropdown, theme_dropdown],
        outputs=[cloned_out, video_out, npz_file, folder_path],
    )

demo.queue(api_open=False, max_size=10).launch()