"""Gradio demo: instant voice cloning with Coqui TTS + Bark.

The user uploads a WAV sample of a voice and types a prompt; the app stores
the sample under ``bark_voices/<speaker>/`` and asks the Bark model (through
the Coqui ``TTS`` API) to speak the prompt with that voice.
"""
import gradio as gr
import os
import shutil
#from huggingface_hub import snapshot_download
import numpy as np
from scipy.io import wavfile

from TTS.api import TTS

# NOTE: an earlier revision downloaded the `suno/bark` checkpoint manually and
# drove `TTS.tts.models.bark.Bark` directly; the high-level `TTS` API below
# replaces that path.

# Root directory holding one sub-directory per cloned speaker.
VOICES_DIR = "bark_voices"
# Synthesized speech is always written to this file (overwritten on each call).
OUTPUT_WAV = "output.wav"

# Loaded once at import time so every request reuses the same model instance.
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)


def _find_npz(directory):
    """Return the path of the first ``.npz`` file in *directory*, or ``None``.

    Entries are sorted so the result does not depend on the arbitrary order
    in which ``os.listdir`` reports directory entries.
    """
    for entry in sorted(os.listdir(directory)):
        if entry.lower().endswith(".npz"):
            return os.path.join(directory, entry)
    return None


def infer(prompt, input_wav_file):
    """Clone the uploaded voice and speak *prompt* with it.

    Args:
        prompt: Text to synthesize.
        input_wav_file: Filesystem path of the uploaded WAV sample. The file
            is moved into ``bark_voices/<name>/<name>.wav``, where ``<name>``
            is the upload's base name without extension.

    Returns:
        A 3-tuple ``(audio_path, waveform_video, npz_path)``: the path of the
        synthesized WAV, the value returned by ``gr.make_waveform`` for it,
        and the path of a ``.npz`` file found in the speaker directory (or
        ``None`` when the directory contains none).

    Raises:
        gr.Error: If no audio file was provided.
    """
    if not input_wav_file:
        # Without this guard os.path.basename(None) fails with an opaque
        # TypeError; surface an actionable message in the UI instead.
        raise gr.Error("Please upload a WAV file of the voice you want to clone.")

    print("SAVING THE AUDIO FILE TO WHERE IT BELONGS")

    # The speaker id is the upload's file name without its extension.
    speaker_name = os.path.splitext(os.path.basename(input_wav_file))[0]
    speaker_dir = os.path.join(VOICES_DIR, speaker_name)
    os.makedirs(speaker_dir, exist_ok=True)

    # The sample is stored as <voice_dir>/<speaker>/<speaker>.wav, the layout
    # the voice_dir/speaker arguments below refer to.
    shutil.move(input_wav_file, os.path.join(speaker_dir, f"{speaker_name}.wav"))

    tts.tts_to_file(
        text=prompt,
        file_path=OUTPUT_WAV,
        voice_dir=f"{VOICES_DIR}/",
        speaker=speaker_name,
    )

    # Log what ended up in the speaker directory (useful when debugging).
    for item in sorted(os.listdir(speaker_dir)):
        print(item)

    tts_video = gr.make_waveform(audio=OUTPUT_WAV)

    # Pick the .npz explicitly rather than indexing the directory listing:
    # listing order is arbitrary, so a fixed index may return the .wav or
    # raise IndexError. Presumably the .npz is written by Bark during
    # synthesis -- confirm against the TTS library.
    npz_path = _find_npz(speaker_dir)

    return OUTPUT_WAV, tts_video, npz_path


css = """
#col-container {max-width: 580px; margin-left: auto; margin-right: auto;}
img[src*='#center'] {
    display: block;
    margin: auto;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):

        gr.Markdown("""

Instant Voice Cloning

Clone any voice in less than 2 minutes with this Coqui TSS + Bark demo !
Upload a clean 20 seconds WAV file of the voice you want to clone,
type your text-to-speech prompt and hit submit !

[![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm.svg#center)](https://huggingface.co/spaces/fffiloni/instant-TTS-Bark-cloning?duplicate=true)
        """)

        # Inputs
        prompt = gr.Textbox(
            label="Text to speech prompt"
        )
        audio_in = gr.Audio(
            label="WAV voice to clone",
            type="filepath",
            source="upload"
        )

        submit_btn = gr.Button("Submit")
        # Hidden secondary button; no handler is attached to it in this file.
        submit_with_npz_btn = gr.Button("Submit 2", visible=False)

        # Outputs
        cloned_out = gr.Audio(
            label="Text to speech output"
        )
        video_out = gr.Video(
            label="Waveform video"
        )
        npz_file = gr.File(
            label=".npz file"
        )

    submit_btn.click(
        fn=infer,
        inputs=[
            prompt,
            audio_in
        ],
        outputs=[
            cloned_out,
            video_out,
            npz_file
        ]
    )

demo.queue().launch()