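# Gradio demo: one-shot voice cloning and text-to-speech with Bark (Coqui TTS).
# A voice sample is uploaded, recorded, or picked from a character gallery,
# optionally cleaned with Demucs, then used to synthesize speech from a prompt.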
from TTS.api import TTS
import json
import gradio as gr
# from share_btn import community_icon_html, loading_icon_html, share_js  # share UI not wired in this build
import os
import shutil
import re
# from huggingface_hub import snapshot_download
from scipy.io.wavfile import write
from pydub import AudioSegment

file_upload_available = os.environ.get("ALLOW_FILE_UPLOAD")
MAX_NUMBER_SENTENCES = 10
with open("characters.json", "r") as file:
    data = json.load(file)

characters = [
    {
        "image": item["image"],
        "title": item["title"],
        "speaker": item["speaker"]
    }
    for item in data
]

tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)
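# Trim a WAV file to at most `max_duration` seconds and save the result next
# to the script with a "_cut" suffix.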
def cut_wav(input_path, max_duration):
    # Load the WAV file
    audio = AudioSegment.from_wav(input_path)

    # Calculate the duration of the audio in seconds
    audio_duration = len(audio) / 1000  # pydub reports length in milliseconds

    # Keep at most max_duration seconds (the minimum of the two durations)
    cut_duration = min(max_duration, audio_duration)

    # Cut the audio (slicing is done in milliseconds)
    cut_audio = audio[:int(cut_duration * 1000)]

    # Get the input file name without extension
    file_name = os.path.splitext(os.path.basename(input_path))[0]

    # Construct the output file path with the original file name and "_cut" suffix
    output_path = f"{file_name}_cut.wav"

    # Save the cut audio as a new WAV file
    cut_audio.export(output_path, format="wav")

    return output_path
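# Helpers that copy the freshly uploaded or recorded sample into the hidden
# numpy-typed audio component used by the cleaning step.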
def load_hidden(audio_in):
    return audio_in


def load_hidden_mic(audio_in):
    print("USER RECORDED A NEW SAMPLE")

    library_path = 'bark_voices'
    folder_name = 'audio-0-100'
    second_folder_name = 'audio-0-100_cleaned'

    folder_path = os.path.join(library_path, folder_name)
    second_folder_path = os.path.join(library_path, second_folder_name)

    print("Cleaning up utility files from previous runs, if any:")
    if os.path.exists(folder_path):
        try:
            shutil.rmtree(folder_path)
            print(f"Successfully deleted the folder created from the last raw recorded sample: {folder_path}")
        except OSError as e:
            print(f"Error: {folder_path} - {e.strerror}")
    else:
        print(f"OK, the folder for a raw recorded sample does not exist: {folder_path}")

    if os.path.exists(second_folder_path):
        try:
            shutil.rmtree(second_folder_path)
            print(f"Successfully deleted the folder created from the last cleaned recorded sample: {second_folder_path}")
        except OSError as e:
            print(f"Error: {second_folder_path} - {e.strerror}")
    else:
        print(f"OK, the folder for a cleaned recorded sample does not exist: {second_folder_path}")

    return audio_in


def clear_clean_check():
    return False
def wipe_npz_file(folder_path):
    print("YO • a user is manipulating audio inputs")
def split_process(audio, chosen_out_track):
    gr.Info("Cleaning your audio sample...")
    os.makedirs("out", exist_ok=True)
    write('test.wav', audio[0], audio[1])

    os.system("python3 -m demucs.separate -n mdx_extra_q -j 4 test.wav -o out")
    # return "./out/mdx_extra_q/test/vocals.wav", "./out/mdx_extra_q/test/bass.wav", "./out/mdx_extra_q/test/drums.wav", "./out/mdx_extra_q/test/other.wav"

    if chosen_out_track == "vocals":
        print("Audio sample cleaned")
        return "./out/mdx_extra_q/test/vocals.wav"
    elif chosen_out_track == "bass":
        return "./out/mdx_extra_q/test/bass.wav"
    elif chosen_out_track == "drums":
        return "./out/mdx_extra_q/test/drums.wav"
    elif chosen_out_track == "other":
        return "./out/mdx_extra_q/test/other.wav"
    elif chosen_out_track == "all-in":
        return "test.wav"
def update_selection(selected_state: gr.SelectData):
    c_title = characters[selected_state.index]["title"]
    return c_title, selected_state
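# Main inference path for uploaded/recorded samples: optionally clean the
# sample with Demucs, store it under bark_voices/<name>/, cap the prompt at
# MAX_NUMBER_SENTENCES sentences, then synthesize with Bark voice cloning.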
def infer(prompt, input_wav_file, clean_audio, hidden_numpy_audio):
    print("""
    -----
    NEW INFERENCE:
    -----
    """)
    if prompt == "":
        gr.Warning("Do not forget to provide a TTS prompt!")

    if clean_audio is True:
        print("We want to clean the audio sample")
        # Extract the file name without the extension
        new_name = os.path.splitext(os.path.basename(input_wav_file))[0]
        print(f"FILE BASENAME is: {new_name}")
        if os.path.exists(os.path.join("bark_voices", f"{new_name}_cleaned")):
            print("This file has already been cleaned")
            check_name = os.path.join("bark_voices", f"{new_name}_cleaned")
            source_path = os.path.join(check_name, f"{new_name}_cleaned.wav")
        else:
            print("This file is new, we need to clean and store it")
            source_path = split_process(hidden_numpy_audio, "vocals")
            # Rename the cleaned file
            new_path = os.path.join(os.path.dirname(source_path), f"{new_name}_cleaned.wav")
            os.rename(source_path, new_path)
            source_path = new_path
    else:
        print("We do NOT want to clean the audio sample")
        # Path to the WAV file
        source_path = input_wav_file

    # Destination directory
    destination_directory = "bark_voices"

    # Extract the file name without the extension
    file_name = os.path.splitext(os.path.basename(source_path))[0]

    # Construct the full destination directory path
    destination_path = os.path.join(destination_directory, file_name)

    # Create the new directory
    os.makedirs(destination_path, exist_ok=True)

    # Move the WAV file to the new directory
    shutil.move(source_path, os.path.join(destination_path, f"{file_name}.wav"))

    # Split the text into sentences based on common punctuation marks
    sentences = re.split(r'(?<=[.!?])\s+', prompt)

    if len(sentences) > MAX_NUMBER_SENTENCES:
        gr.Info(f"Your text is too long. To keep this demo enjoyable for everyone, we only kept the first {MAX_NUMBER_SENTENCES} sentences :) Duplicate this space and raise MAX_NUMBER_SENTENCES for longer texts ;)")
        # Keep only the first MAX_NUMBER_SENTENCES sentences and rejoin them
        prompt = ' '.join(sentences[:MAX_NUMBER_SENTENCES])

    gr.Info("Generating audio from prompt")
    tts.tts_to_file(text=prompt,
                    file_path="output.wav",
                    voice_dir="bark_voices/",
                    speaker=f"{file_name}")

    # List the files generated for this speaker and pick the .npz voice file
    # explicitly (indexing the directory listing by position is fragile)
    contents = os.listdir(f"bark_voices/{file_name}")
    for item in contents:
        print(item)
    npz_name = next(item for item in contents if item.endswith(".npz"))

    print("Preparing final waveform video ...")
    tts_video = gr.make_waveform(audio="output.wav")
    print(tts_video)
    print("FINISHED")

    # Return values must match the outputs declared in submit_btn.click below
    return "output.wav", tts_video, gr.update(value=f"bark_voices/{file_name}/{npz_name}", visible=True), destination_path
def infer_from_c(prompt, c_name):
    print("""
    -----
    NEW INFERENCE:
    -----
    """)
    if prompt == "":
        gr.Warning("Do not forget to provide a TTS prompt!")
        print("Warning about prompt sent to user")

    print(f"USING VOICE LIBRARY: {c_name}")

    # Split the text into sentences based on common punctuation marks
    sentences = re.split(r'(?<=[.!?])\s+', prompt)

    if len(sentences) > MAX_NUMBER_SENTENCES:
        gr.Info(f"Your text is too long. To keep this demo enjoyable for everyone, we only kept the first {MAX_NUMBER_SENTENCES} sentences :) Duplicate this space and raise MAX_NUMBER_SENTENCES for longer texts ;)")
        # Keep only the first MAX_NUMBER_SENTENCES sentences and rejoin them
        prompt = ' '.join(sentences[:MAX_NUMBER_SENTENCES])

    if c_name == "":
        gr.Warning("Voice character is not properly selected. Please ensure that the name of the chosen voice is specified in the Character Name input.")
        print("Warning about Voice Name sent to user")
    else:
        print(f"Generating audio from prompt with {c_name} ;)")
        tts.tts_to_file(text=prompt,
                        file_path="output.wav",
                        voice_dir="examples/library/",
                        speaker=f"{c_name}")

    print("Preparing final waveform video ...")
    tts_video = gr.make_waveform(audio="output.wav")
    print(tts_video)
    print("FINISHED")

    # Return values must match the outputs declared in c_submit_btn.click below
    return "output.wav", tts_video, gr.update(value=f"examples/library/{c_name}/{c_name}.npz", visible=True)
| css = """ | |
| #col-container {max-width: 780px; margin-left: auto; margin-right: auto;} | |
| a {text-decoration-line: underline; font-weight: 600;} | |
| .mic-wrap > button { | |
| width: 100%; | |
| height: 60px; | |
| font-size: 1.4em!important; | |
| } | |
| .record-icon.svelte-1thnwz { | |
| display: flex; | |
| position: relative; | |
| margin-right: var(--size-2); | |
| width: unset; | |
| height: unset; | |
| } | |
| span.record-icon > span.dot.svelte-1thnwz { | |
| width: 20px!important; | |
| height: 20px!important; | |
| } | |
| .animate-spin { | |
| animation: spin 1s linear infinite; | |
| } | |
| @keyframes spin { | |
| from { | |
| transform: rotate(0deg); | |
| } | |
| to { | |
| transform: rotate(360deg); | |
| } | |
| } | |
| #share-btn-container { | |
| display: flex; | |
| padding-left: 0.5rem !important; | |
| padding-right: 0.5rem !important; | |
| background-color: #000000; | |
| justify-content: center; | |
| align-items: center; | |
| border-radius: 9999px !important; | |
| max-width: 15rem; | |
| height: 36px; | |
| } | |
| img[src*='#center'] { | |
| display: block; | |
| margin: auto; | |
| } | |
| .footer { | |
| margin-bottom: 45px; | |
| margin-top: 10px; | |
| text-align: center; | |
| border-bottom: 1px solid #e5e5e5; | |
| } | |
| .footer>p { | |
| font-size: .8rem; | |
| display: inline-block; | |
| padding: 0 10px; | |
| transform: translateY(10px); | |
| background: white; | |
| } | |
| .dark .footer { | |
| border-color: #303030; | |
| } | |
| .dark .footer>p { | |
| background: #0b0f19; | |
| } | |
| .disclaimer { | |
| text-align: left; | |
| } | |
| .disclaimer > p { | |
| font-size: .8rem; | |
| } | |
| """ | |
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        with gr.Row():
            with gr.Column():
                prompt_choices = [
                    "Hey mom, I have a flat tire and need an e-transfer, send it to bob@gmail.com please",
                    "Hey, it's me. I lost my phone and need some cash for a cab. Can you send it to my friend's PayPal account?",
                    "I'm locked out of my email account. Can you share your login details with me so I can check my messages?",
                ]
                # Create a Dropdown with the hardcoded prompts
                prompt = gr.Dropdown(
                    label="Text to speech prompt",
                    choices=prompt_choices,
                    elem_id="tts-prompt"
                )
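                # Input tabs 1 and 2: upload a WAV sample (can be disabled via
                # the ALLOW_FILE_UPLOAD environment variable) or record one
                # with the microphone.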
                with gr.Tab("File upload"):
                    with gr.Column():
                        if file_upload_available == "True":
                            audio_in = gr.Audio(
                                label="WAV voice to clone",
                                type="filepath",
                                source="upload"
                            )
                        else:
                            audio_in = gr.Audio(
                                label="WAV voice to clone",
                                type="filepath",
                                source="upload",
                                interactive=False
                            )
                        clean_sample = gr.Checkbox(label="Clean sample?", value=False)
                        hidden_audio_numpy = gr.Audio(type="numpy", visible=False)
                        submit_btn = gr.Button("Submit")

                with gr.Tab("Microphone"):
                    texts_samples = gr.Textbox(
                        label="Helpers",
                        info="You can read one of these sentences out loud if you do not know what to record :)",
                        value='''"Jazz, a quirky mix of groovy saxophones and wailing trumpets, echoes through the vibrant city streets."
---
"A majestic orchestra plays enchanting melodies, filling the air with harmony."
---
"The exquisite aroma of freshly baked bread wafts from a cozy bakery, enticing passersby."
---
"A thunderous roar shakes the ground as a massive jet takes off into the sky, leaving trails of white behind."
---
"Laughter erupts from a park where children play, their innocent voices rising like tinkling bells."
---
"Waves crash on the beach, and seagulls caw as they soar overhead, a symphony of nature's sounds."
---
"In the distance, a blacksmith hammers red-hot metal, the rhythmic clang punctuating the day."
---
"As evening falls, a soft hush blankets the world, crickets chirping in a soothing rhythm."
''',
                        interactive=False,
                        lines=5
                    )
                    micro_in = gr.Audio(
                        label="Record voice to clone",
                        type="filepath",
                        source="microphone",
                        interactive=True
                    )
                    clean_micro = gr.Checkbox(label="Clean sample?", value=False)
                    micro_submit_btn = gr.Button("Submit")

                audio_in.upload(fn=load_hidden, inputs=[audio_in],
                                outputs=[hidden_audio_numpy], queue=False)
                micro_in.stop_recording(fn=load_hidden_mic, inputs=[micro_in],
                                        outputs=[hidden_audio_numpy], queue=False)
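                # Input tab 3: pick a prebuilt voice from the character gallery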
                with gr.Tab("Voices Characters"):
                    selected_state = gr.State()
                    gallery_in = gr.Gallery(
                        label="Character Gallery",
                        value=[(item["image"], item["title"]) for item in characters],
                        interactive=True,
                        allow_preview=False,
                        columns=3,
                        elem_id="gallery",
                        show_share_button=False
                    )
                    c_submit_btn = gr.Button("Submit")
            with gr.Column():
                cloned_out = gr.Audio(
                    label="Text to speech output",
                    visible=False
                )
                video_out = gr.Video(
                    label="Waveform video",
                    elem_id="voice-video-out"
                )
                npz_file = gr.File(
                    label=".npz file",
                    visible=False
                )
                folder_path = gr.Textbox(visible=False)
                character_name = gr.Textbox(
                    label="Character Name",
                    placeholder="Name that voice character",
                    elem_id="character-name"
                )
                voice_description = gr.Textbox(
                    label="Description",
                    placeholder="How would you describe that voice?",
                    elem_id="voice-description"
                )
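    # Event wiring: gallery selection, sample changes, and the three submit buttons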
    gallery_in.select(
        update_selection,
        outputs=[character_name, selected_state],
        queue=False,
        show_progress=False,
    )

    audio_in.change(fn=wipe_npz_file, inputs=[folder_path], queue=False)
    micro_in.clear(fn=wipe_npz_file, inputs=[folder_path], queue=False)

    submit_btn.click(
        fn=infer,
        inputs=[
            prompt,
            audio_in,
            clean_sample,
            hidden_audio_numpy
        ],
        outputs=[
            cloned_out,
            video_out,
            npz_file,
            folder_path
        ]
    )

    micro_submit_btn.click(
        fn=infer,
        inputs=[
            prompt,
            micro_in,
            clean_micro,
            hidden_audio_numpy
        ],
        outputs=[
            cloned_out,
            video_out,
            npz_file,
            folder_path
        ]
    )

    c_submit_btn.click(
        fn=infer_from_c,
        inputs=[
            prompt,
            character_name
        ],
        outputs=[
            cloned_out,
            video_out,
            npz_file,
        ]
    )

demo.queue(api_open=False, max_size=10).launch()
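# Local usage note (a sketch, assuming this file is saved as app.py, the
# dependencies above are installed, and a GPU is available for Bark):
#   ALLOW_FILE_UPLOAD=True python3 app.py
# then open the printed local URL in a browser.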